web-to-markdown 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/web-to-markdown.js +113 -45
- package/dist/index.js +97 -31
- package/package.json +2 -1
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/cli.ts
|
|
4
|
-
import { resolve, dirname } from "path";
|
|
5
|
-
import { existsSync,
|
|
4
|
+
import { resolve, dirname, sep } from "path";
|
|
5
|
+
import { existsSync, readFileSync } from "fs";
|
|
6
6
|
import { realpath, writeFile } from "fs/promises";
|
|
7
7
|
import { fileURLToPath } from "url";
|
|
8
8
|
import { Command } from "commander";
|
|
@@ -47,10 +47,10 @@ function expandIPv6(ip) {
|
|
|
47
47
|
} else {
|
|
48
48
|
groups = ip.split(":");
|
|
49
49
|
}
|
|
50
|
-
return groups.map((g) => g.toLowerCase()).join(":");
|
|
50
|
+
return groups.map((g) => g.toLowerCase().padStart(4, "0")).join(":");
|
|
51
51
|
}
|
|
52
52
|
function normalizeIP(hostname) {
|
|
53
|
-
const bare = hostname.replace(/^\[|\]$/g, "");
|
|
53
|
+
const bare = hostname.replace(/^\[|\]$/g, "").replace(/%.*$/, "");
|
|
54
54
|
if (isIP(bare) === 4) return bare;
|
|
55
55
|
if (isIP(bare) === 6) {
|
|
56
56
|
const ffmpDotted = bare.match(/^::ffff:(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/i);
|
|
@@ -62,7 +62,9 @@ function normalizeIP(hostname) {
|
|
|
62
62
|
return `${high >> 8 & 255}.${high & 255}.${low >> 8 & 255}.${low & 255}`;
|
|
63
63
|
}
|
|
64
64
|
const fullFfmpHex = expandIPv6(bare);
|
|
65
|
-
const fullFfmpMatch = fullFfmpHex.match(
|
|
65
|
+
const fullFfmpMatch = fullFfmpHex.match(
|
|
66
|
+
/^0000:0000:0000:0000:0000:ffff:([0-9a-f]{4}):([0-9a-f]{4})$/i
|
|
67
|
+
);
|
|
66
68
|
if (fullFfmpMatch) {
|
|
67
69
|
const high = parseInt(fullFfmpMatch[1], 16);
|
|
68
70
|
const low = parseInt(fullFfmpMatch[2], 16);
|
|
@@ -119,20 +121,21 @@ function isPrivateIP(ip) {
|
|
|
119
121
|
if (ip.startsWith("198.51.100.")) return true;
|
|
120
122
|
if (ip.startsWith("203.0.113.")) return true;
|
|
121
123
|
if (/^198\.(18|19)\./.test(ip)) return true;
|
|
122
|
-
if (
|
|
124
|
+
if (ip.includes(".") && !ip.includes(":")) {
|
|
125
|
+
const firstOctet = parseInt(ip.split(".")[0], 10);
|
|
126
|
+
if (firstOctet >= 224) return true;
|
|
127
|
+
}
|
|
123
128
|
const expanded = ip.includes("::") ? expandIPv6(ip) : ip.toLowerCase();
|
|
124
|
-
if (ip === "::1" || ip === "::" || expanded === "
|
|
129
|
+
if (ip === "::1" || ip === "::" || expanded === "0000:0000:0000:0000:0000:0000:0000:0001" || expanded === "0000:0000:0000:0000:0000:0000:0000:0000")
|
|
125
130
|
return true;
|
|
126
131
|
if (expanded.startsWith("fc") || expanded.startsWith("fd")) return true;
|
|
127
|
-
if (
|
|
128
|
-
|
|
129
|
-
if (expanded.startsWith("fec0:") || expanded.startsWith("fec") || expanded.startsWith("fed") || expanded.startsWith("fee") || expanded.startsWith("fef"))
|
|
130
|
-
return true;
|
|
132
|
+
if (/^fe[89ab][0-9a-f]:/.test(expanded)) return true;
|
|
133
|
+
if (/^fe[cdef][0-9a-f]:/.test(expanded)) return true;
|
|
131
134
|
if (expanded.startsWith("ff")) return true;
|
|
132
|
-
if (expanded.startsWith("
|
|
135
|
+
if (expanded.startsWith("0064:ff9b:")) return true;
|
|
133
136
|
if (expanded.startsWith("2002:")) return true;
|
|
134
|
-
if (expanded.startsWith("2001:
|
|
135
|
-
if (expanded.startsWith("2001:
|
|
137
|
+
if (expanded.startsWith("2001:0db8:")) return true;
|
|
138
|
+
if (expanded.startsWith("2001:0000:")) return true;
|
|
136
139
|
return false;
|
|
137
140
|
}
|
|
138
141
|
|
|
@@ -182,6 +185,8 @@ var BLOCKED_HOSTNAME_SUFFIXES = [
|
|
|
182
185
|
".internal",
|
|
183
186
|
".local",
|
|
184
187
|
".localhost",
|
|
188
|
+
".localdomain",
|
|
189
|
+
".intranet",
|
|
185
190
|
".corp",
|
|
186
191
|
".home",
|
|
187
192
|
".lan"
|
|
@@ -202,13 +207,18 @@ async function resolveAndValidateHostname(hostname) {
|
|
|
202
207
|
return normalizedIP;
|
|
203
208
|
}
|
|
204
209
|
try {
|
|
205
|
-
const
|
|
206
|
-
if (
|
|
207
|
-
throw new
|
|
208
|
-
|
|
209
|
-
|
|
210
|
+
const results = await lookup(hostname, { all: true });
|
|
211
|
+
if (results.length === 0) {
|
|
212
|
+
throw new NetworkError(`DNS lookup for "${hostname}" returned no results. Check the URL.`);
|
|
213
|
+
}
|
|
214
|
+
for (const { address } of results) {
|
|
215
|
+
if (isPrivateIP(address)) {
|
|
216
|
+
throw new SSRFError(
|
|
217
|
+
`Blocked request to "${hostname}" \u2014 resolves to private IP. Requests to internal networks are not allowed.`
|
|
218
|
+
);
|
|
219
|
+
}
|
|
210
220
|
}
|
|
211
|
-
return address;
|
|
221
|
+
return results[0].address;
|
|
212
222
|
} catch (err) {
|
|
213
223
|
if (err instanceof SSRFError) throw err;
|
|
214
224
|
if (err instanceof Error) {
|
|
@@ -233,7 +243,7 @@ async function validateUrl(url) {
|
|
|
233
243
|
`Unsupported protocol "${parsed.protocol}". Only HTTP and HTTPS are supported.`
|
|
234
244
|
);
|
|
235
245
|
}
|
|
236
|
-
const hostname = parsed.hostname.toLowerCase();
|
|
246
|
+
const hostname = parsed.hostname.toLowerCase().replace(/\.$/, "");
|
|
237
247
|
if (BLOCKED_HOSTNAMES.has(hostname)) {
|
|
238
248
|
throw new SSRFError(
|
|
239
249
|
`Blocked request to "${hostname}". Requests to internal hosts are not allowed.`
|
|
@@ -248,6 +258,20 @@ async function validateUrl(url) {
|
|
|
248
258
|
}
|
|
249
259
|
return resolveAndValidateHostname(hostname);
|
|
250
260
|
}
|
|
261
|
+
function extractCharset(headers) {
|
|
262
|
+
const raw = headers["content-type"];
|
|
263
|
+
const contentType = typeof raw === "string" ? raw : "";
|
|
264
|
+
const match = contentType.match(/charset\s*=\s*["']?([^"';,\s]+)["']?/i);
|
|
265
|
+
return match ? match[1].toLowerCase() : null;
|
|
266
|
+
}
|
|
267
|
+
function decodeBody(buffer, charset) {
|
|
268
|
+
const encoding = charset || "utf-8";
|
|
269
|
+
try {
|
|
270
|
+
return new TextDecoder(encoding).decode(buffer);
|
|
271
|
+
} catch {
|
|
272
|
+
return buffer.toString("utf-8");
|
|
273
|
+
}
|
|
274
|
+
}
|
|
251
275
|
var HTML_CONTENT_TYPES = [
|
|
252
276
|
"text/html",
|
|
253
277
|
"application/xhtml+xml",
|
|
@@ -282,11 +306,14 @@ function pinnedRequest(url, resolvedIP, timeout) {
|
|
|
282
306
|
const parsed = new URL(url);
|
|
283
307
|
const isHttps = parsed.protocol === "https:";
|
|
284
308
|
const requestFn = isHttps ? httpsRequest : httpRequest;
|
|
309
|
+
const abortController = new AbortController();
|
|
310
|
+
const totalTimer = setTimeout(() => abortController.abort(), timeout);
|
|
285
311
|
const req = requestFn(
|
|
286
312
|
url,
|
|
287
313
|
{
|
|
288
314
|
method: "GET",
|
|
289
315
|
timeout,
|
|
316
|
+
signal: abortController.signal,
|
|
290
317
|
lookup: createPinnedLookup(resolvedIP),
|
|
291
318
|
headers: {
|
|
292
319
|
"User-Agent": "Mozilla/5.0 (compatible; web-to-markdown/0.1; +https://github.com/nidhi-singh02/mark-it-down)",
|
|
@@ -300,6 +327,7 @@ function pinnedRequest(url, resolvedIP, timeout) {
|
|
|
300
327
|
res.on("data", (chunk) => {
|
|
301
328
|
totalBytes += chunk.length;
|
|
302
329
|
if (totalBytes > MAX_RESPONSE_SIZE) {
|
|
330
|
+
clearTimeout(totalTimer);
|
|
303
331
|
req.destroy();
|
|
304
332
|
reject(
|
|
305
333
|
new ContentError(
|
|
@@ -311,24 +339,36 @@ function pinnedRequest(url, resolvedIP, timeout) {
|
|
|
311
339
|
chunks.push(chunk);
|
|
312
340
|
});
|
|
313
341
|
res.on("end", () => {
|
|
342
|
+
clearTimeout(totalTimer);
|
|
314
343
|
const bodyBuffer = Buffer.concat(chunks);
|
|
344
|
+
const responseHeaders = res.headers;
|
|
345
|
+
const charset = extractCharset(responseHeaders);
|
|
315
346
|
resolve2({
|
|
316
347
|
status: res.statusCode || 0,
|
|
317
|
-
headers:
|
|
318
|
-
body: bodyBuffer
|
|
348
|
+
headers: responseHeaders,
|
|
349
|
+
body: decodeBody(bodyBuffer, charset),
|
|
319
350
|
bodyBuffer,
|
|
320
351
|
responseUrl: url
|
|
321
352
|
});
|
|
322
353
|
});
|
|
323
|
-
res.on("error",
|
|
354
|
+
res.on("error", (err) => {
|
|
355
|
+
clearTimeout(totalTimer);
|
|
356
|
+
reject(err);
|
|
357
|
+
});
|
|
324
358
|
}
|
|
325
359
|
);
|
|
326
360
|
req.on("timeout", () => {
|
|
361
|
+
clearTimeout(totalTimer);
|
|
327
362
|
req.destroy();
|
|
328
363
|
reject(new NetworkError("Request timed out."));
|
|
329
364
|
});
|
|
330
365
|
req.on("error", (err) => {
|
|
331
|
-
|
|
366
|
+
clearTimeout(totalTimer);
|
|
367
|
+
if (abortController.signal.aborted) {
|
|
368
|
+
reject(new NetworkError("Request timed out (total deadline exceeded)."));
|
|
369
|
+
} else {
|
|
370
|
+
reject(new NetworkError(`Request failed: ${err.message}`, void 0, { cause: err }));
|
|
371
|
+
}
|
|
332
372
|
});
|
|
333
373
|
req.end();
|
|
334
374
|
});
|
|
@@ -381,24 +421,47 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
|
|
|
381
421
|
);
|
|
382
422
|
}
|
|
383
423
|
const parsed = new URL(url);
|
|
384
|
-
const
|
|
424
|
+
const pinnedAddr = resolvedIP.includes(":") ? `[${resolvedIP}]` : resolvedIP;
|
|
425
|
+
const hostResolverRule = `MAP ${parsed.hostname} ${pinnedAddr}`;
|
|
426
|
+
const chromiumArgs = [
|
|
427
|
+
"--disable-features=WebSockets",
|
|
428
|
+
`--host-resolver-rules=${hostResolverRule}`,
|
|
429
|
+
// Avoid /dev/shm exhaustion in Docker containers (default 64MB)
|
|
430
|
+
"--disable-dev-shm-usage"
|
|
431
|
+
];
|
|
432
|
+
if (process.platform === "linux" && process.getuid?.() === 0) {
|
|
433
|
+
chromiumArgs.push("--no-sandbox");
|
|
434
|
+
}
|
|
385
435
|
const browser = await playwright.chromium.launch({
|
|
386
436
|
headless: true,
|
|
387
|
-
args:
|
|
437
|
+
args: chromiumArgs
|
|
388
438
|
});
|
|
389
439
|
try {
|
|
390
440
|
const context = await browser.newContext({
|
|
391
441
|
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
392
442
|
});
|
|
393
443
|
const page = await context.newPage();
|
|
444
|
+
const initialHostname = parsed.hostname;
|
|
394
445
|
await context.route("**/*", async (route) => {
|
|
395
446
|
const requestUrl = route.request().url();
|
|
396
447
|
try {
|
|
397
|
-
const
|
|
398
|
-
if (
|
|
399
|
-
await
|
|
448
|
+
const reqParsed = new URL(requestUrl);
|
|
449
|
+
if (reqParsed.protocol !== "http:" && reqParsed.protocol !== "https:") {
|
|
450
|
+
await route.continue();
|
|
451
|
+
return;
|
|
452
|
+
}
|
|
453
|
+
const subResolvedIP = await validateUrl(requestUrl);
|
|
454
|
+
if (reqParsed.hostname === initialHostname) {
|
|
455
|
+
await route.continue();
|
|
456
|
+
return;
|
|
400
457
|
}
|
|
401
|
-
await
|
|
458
|
+
const response = await pinnedRequest(requestUrl, subResolvedIP, safTimeout);
|
|
459
|
+
const contentType = typeof response.headers["content-type"] === "string" ? response.headers["content-type"] : "application/octet-stream";
|
|
460
|
+
await route.fulfill({
|
|
461
|
+
status: response.status,
|
|
462
|
+
contentType,
|
|
463
|
+
body: response.bodyBuffer
|
|
464
|
+
});
|
|
402
465
|
} catch {
|
|
403
466
|
await route.abort("blockedbyclient");
|
|
404
467
|
}
|
|
@@ -416,7 +479,10 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
|
|
|
416
479
|
}
|
|
417
480
|
return { html, finalUrl: page.url() };
|
|
418
481
|
} finally {
|
|
419
|
-
|
|
482
|
+
try {
|
|
483
|
+
await browser.close();
|
|
484
|
+
} catch {
|
|
485
|
+
}
|
|
420
486
|
}
|
|
421
487
|
}
|
|
422
488
|
async function fetchRawText(url, timeout) {
|
|
@@ -580,7 +646,7 @@ function extractMdx(html, url) {
|
|
|
580
646
|
return extractMintlifyMdx(html, url);
|
|
581
647
|
}
|
|
582
648
|
function processRawMdx(mdx, url) {
|
|
583
|
-
const { body, metadata } = parseFrontmatter(mdx);
|
|
649
|
+
const { body, metadata } = parseFrontmatter(mdx.replace(/\r\n/g, "\n"));
|
|
584
650
|
let markdown = stripMdxComponents(body);
|
|
585
651
|
markdown = resolveRelativeUrls(markdown, url);
|
|
586
652
|
markdown = markdown.replace(/\n{3,}/g, "\n\n");
|
|
@@ -1497,7 +1563,8 @@ function getPackageVersion() {
|
|
|
1497
1563
|
async function validateOutputPath(outputPath) {
|
|
1498
1564
|
const resolved = resolve(outputPath);
|
|
1499
1565
|
const cwd = process.cwd();
|
|
1500
|
-
|
|
1566
|
+
const cwdPrefix = cwd.endsWith(sep) ? cwd : cwd + sep;
|
|
1567
|
+
if (!resolved.startsWith(cwdPrefix) && resolved !== cwd) {
|
|
1501
1568
|
throw new Error(
|
|
1502
1569
|
`Output path "${outputPath}" resolves outside the current directory.
|
|
1503
1570
|
Resolved to: ${resolved}
|
|
@@ -1507,17 +1574,15 @@ For safety, output files must be within the working directory.`
|
|
|
1507
1574
|
let checkPath = resolved;
|
|
1508
1575
|
while (checkPath !== cwd && checkPath !== dirname(checkPath)) {
|
|
1509
1576
|
if (existsSync(checkPath)) {
|
|
1510
|
-
const
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
);
|
|
1520
|
-
}
|
|
1577
|
+
const realTarget = await realpath(checkPath);
|
|
1578
|
+
const realCwd = await realpath(cwd);
|
|
1579
|
+
const realCwdPrefix = realCwd.endsWith(sep) ? realCwd : realCwd + sep;
|
|
1580
|
+
if (!realTarget.startsWith(realCwdPrefix) && realTarget !== realCwd) {
|
|
1581
|
+
throw new Error(
|
|
1582
|
+
`Output path "${outputPath}" resolves outside the current directory.
|
|
1583
|
+
"${checkPath}" resolves to "${realTarget}".
|
|
1584
|
+
For safety, output files must not escape the working directory via symlinks or junctions.`
|
|
1585
|
+
);
|
|
1521
1586
|
}
|
|
1522
1587
|
break;
|
|
1523
1588
|
}
|
|
@@ -1583,6 +1648,9 @@ Received ${signal}, shutting down\u2026
|
|
|
1583
1648
|
};
|
|
1584
1649
|
process.on("SIGINT", onSignal);
|
|
1585
1650
|
process.on("SIGTERM", onSignal);
|
|
1651
|
+
if (process.platform === "win32") {
|
|
1652
|
+
process.on("SIGBREAK", onSignal);
|
|
1653
|
+
}
|
|
1586
1654
|
await program.parseAsync(process.argv);
|
|
1587
1655
|
}
|
|
1588
1656
|
|
package/dist/index.js
CHANGED
|
@@ -38,10 +38,10 @@ function expandIPv6(ip) {
|
|
|
38
38
|
} else {
|
|
39
39
|
groups = ip.split(":");
|
|
40
40
|
}
|
|
41
|
-
return groups.map((g) => g.toLowerCase()).join(":");
|
|
41
|
+
return groups.map((g) => g.toLowerCase().padStart(4, "0")).join(":");
|
|
42
42
|
}
|
|
43
43
|
function normalizeIP(hostname) {
|
|
44
|
-
const bare = hostname.replace(/^\[|\]$/g, "");
|
|
44
|
+
const bare = hostname.replace(/^\[|\]$/g, "").replace(/%.*$/, "");
|
|
45
45
|
if (isIP(bare) === 4) return bare;
|
|
46
46
|
if (isIP(bare) === 6) {
|
|
47
47
|
const ffmpDotted = bare.match(/^::ffff:(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/i);
|
|
@@ -53,7 +53,9 @@ function normalizeIP(hostname) {
|
|
|
53
53
|
return `${high >> 8 & 255}.${high & 255}.${low >> 8 & 255}.${low & 255}`;
|
|
54
54
|
}
|
|
55
55
|
const fullFfmpHex = expandIPv6(bare);
|
|
56
|
-
const fullFfmpMatch = fullFfmpHex.match(
|
|
56
|
+
const fullFfmpMatch = fullFfmpHex.match(
|
|
57
|
+
/^0000:0000:0000:0000:0000:ffff:([0-9a-f]{4}):([0-9a-f]{4})$/i
|
|
58
|
+
);
|
|
57
59
|
if (fullFfmpMatch) {
|
|
58
60
|
const high = parseInt(fullFfmpMatch[1], 16);
|
|
59
61
|
const low = parseInt(fullFfmpMatch[2], 16);
|
|
@@ -110,20 +112,21 @@ function isPrivateIP(ip) {
|
|
|
110
112
|
if (ip.startsWith("198.51.100.")) return true;
|
|
111
113
|
if (ip.startsWith("203.0.113.")) return true;
|
|
112
114
|
if (/^198\.(18|19)\./.test(ip)) return true;
|
|
113
|
-
if (
|
|
115
|
+
if (ip.includes(".") && !ip.includes(":")) {
|
|
116
|
+
const firstOctet = parseInt(ip.split(".")[0], 10);
|
|
117
|
+
if (firstOctet >= 224) return true;
|
|
118
|
+
}
|
|
114
119
|
const expanded = ip.includes("::") ? expandIPv6(ip) : ip.toLowerCase();
|
|
115
|
-
if (ip === "::1" || ip === "::" || expanded === "
|
|
120
|
+
if (ip === "::1" || ip === "::" || expanded === "0000:0000:0000:0000:0000:0000:0000:0001" || expanded === "0000:0000:0000:0000:0000:0000:0000:0000")
|
|
116
121
|
return true;
|
|
117
122
|
if (expanded.startsWith("fc") || expanded.startsWith("fd")) return true;
|
|
118
|
-
if (
|
|
119
|
-
|
|
120
|
-
if (expanded.startsWith("fec0:") || expanded.startsWith("fec") || expanded.startsWith("fed") || expanded.startsWith("fee") || expanded.startsWith("fef"))
|
|
121
|
-
return true;
|
|
123
|
+
if (/^fe[89ab][0-9a-f]:/.test(expanded)) return true;
|
|
124
|
+
if (/^fe[cdef][0-9a-f]:/.test(expanded)) return true;
|
|
122
125
|
if (expanded.startsWith("ff")) return true;
|
|
123
|
-
if (expanded.startsWith("
|
|
126
|
+
if (expanded.startsWith("0064:ff9b:")) return true;
|
|
124
127
|
if (expanded.startsWith("2002:")) return true;
|
|
125
|
-
if (expanded.startsWith("2001:
|
|
126
|
-
if (expanded.startsWith("2001:
|
|
128
|
+
if (expanded.startsWith("2001:0db8:")) return true;
|
|
129
|
+
if (expanded.startsWith("2001:0000:")) return true;
|
|
127
130
|
return false;
|
|
128
131
|
}
|
|
129
132
|
|
|
@@ -173,6 +176,8 @@ var BLOCKED_HOSTNAME_SUFFIXES = [
|
|
|
173
176
|
".internal",
|
|
174
177
|
".local",
|
|
175
178
|
".localhost",
|
|
179
|
+
".localdomain",
|
|
180
|
+
".intranet",
|
|
176
181
|
".corp",
|
|
177
182
|
".home",
|
|
178
183
|
".lan"
|
|
@@ -193,13 +198,18 @@ async function resolveAndValidateHostname(hostname) {
|
|
|
193
198
|
return normalizedIP;
|
|
194
199
|
}
|
|
195
200
|
try {
|
|
196
|
-
const
|
|
197
|
-
if (
|
|
198
|
-
throw new
|
|
199
|
-
`Blocked request to "${hostname}" \u2014 resolves to private IP. Requests to internal networks are not allowed.`
|
|
200
|
-
);
|
|
201
|
+
const results = await lookup(hostname, { all: true });
|
|
202
|
+
if (results.length === 0) {
|
|
203
|
+
throw new NetworkError(`DNS lookup for "${hostname}" returned no results. Check the URL.`);
|
|
201
204
|
}
|
|
202
|
-
|
|
205
|
+
for (const { address } of results) {
|
|
206
|
+
if (isPrivateIP(address)) {
|
|
207
|
+
throw new SSRFError(
|
|
208
|
+
`Blocked request to "${hostname}" \u2014 resolves to private IP. Requests to internal networks are not allowed.`
|
|
209
|
+
);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
return results[0].address;
|
|
203
213
|
} catch (err) {
|
|
204
214
|
if (err instanceof SSRFError) throw err;
|
|
205
215
|
if (err instanceof Error) {
|
|
@@ -224,7 +234,7 @@ async function validateUrl(url) {
|
|
|
224
234
|
`Unsupported protocol "${parsed.protocol}". Only HTTP and HTTPS are supported.`
|
|
225
235
|
);
|
|
226
236
|
}
|
|
227
|
-
const hostname = parsed.hostname.toLowerCase();
|
|
237
|
+
const hostname = parsed.hostname.toLowerCase().replace(/\.$/, "");
|
|
228
238
|
if (BLOCKED_HOSTNAMES.has(hostname)) {
|
|
229
239
|
throw new SSRFError(
|
|
230
240
|
`Blocked request to "${hostname}". Requests to internal hosts are not allowed.`
|
|
@@ -239,6 +249,20 @@ async function validateUrl(url) {
|
|
|
239
249
|
}
|
|
240
250
|
return resolveAndValidateHostname(hostname);
|
|
241
251
|
}
|
|
252
|
+
function extractCharset(headers) {
|
|
253
|
+
const raw = headers["content-type"];
|
|
254
|
+
const contentType = typeof raw === "string" ? raw : "";
|
|
255
|
+
const match = contentType.match(/charset\s*=\s*["']?([^"';,\s]+)["']?/i);
|
|
256
|
+
return match ? match[1].toLowerCase() : null;
|
|
257
|
+
}
|
|
258
|
+
function decodeBody(buffer, charset) {
|
|
259
|
+
const encoding = charset || "utf-8";
|
|
260
|
+
try {
|
|
261
|
+
return new TextDecoder(encoding).decode(buffer);
|
|
262
|
+
} catch {
|
|
263
|
+
return buffer.toString("utf-8");
|
|
264
|
+
}
|
|
265
|
+
}
|
|
242
266
|
var HTML_CONTENT_TYPES = [
|
|
243
267
|
"text/html",
|
|
244
268
|
"application/xhtml+xml",
|
|
@@ -273,11 +297,14 @@ function pinnedRequest(url, resolvedIP, timeout) {
|
|
|
273
297
|
const parsed = new URL(url);
|
|
274
298
|
const isHttps = parsed.protocol === "https:";
|
|
275
299
|
const requestFn = isHttps ? httpsRequest : httpRequest;
|
|
300
|
+
const abortController = new AbortController();
|
|
301
|
+
const totalTimer = setTimeout(() => abortController.abort(), timeout);
|
|
276
302
|
const req = requestFn(
|
|
277
303
|
url,
|
|
278
304
|
{
|
|
279
305
|
method: "GET",
|
|
280
306
|
timeout,
|
|
307
|
+
signal: abortController.signal,
|
|
281
308
|
lookup: createPinnedLookup(resolvedIP),
|
|
282
309
|
headers: {
|
|
283
310
|
"User-Agent": "Mozilla/5.0 (compatible; web-to-markdown/0.1; +https://github.com/nidhi-singh02/mark-it-down)",
|
|
@@ -291,6 +318,7 @@ function pinnedRequest(url, resolvedIP, timeout) {
|
|
|
291
318
|
res.on("data", (chunk) => {
|
|
292
319
|
totalBytes += chunk.length;
|
|
293
320
|
if (totalBytes > MAX_RESPONSE_SIZE) {
|
|
321
|
+
clearTimeout(totalTimer);
|
|
294
322
|
req.destroy();
|
|
295
323
|
reject(
|
|
296
324
|
new ContentError(
|
|
@@ -302,24 +330,36 @@ function pinnedRequest(url, resolvedIP, timeout) {
|
|
|
302
330
|
chunks.push(chunk);
|
|
303
331
|
});
|
|
304
332
|
res.on("end", () => {
|
|
333
|
+
clearTimeout(totalTimer);
|
|
305
334
|
const bodyBuffer = Buffer.concat(chunks);
|
|
335
|
+
const responseHeaders = res.headers;
|
|
336
|
+
const charset = extractCharset(responseHeaders);
|
|
306
337
|
resolve({
|
|
307
338
|
status: res.statusCode || 0,
|
|
308
|
-
headers:
|
|
309
|
-
body: bodyBuffer
|
|
339
|
+
headers: responseHeaders,
|
|
340
|
+
body: decodeBody(bodyBuffer, charset),
|
|
310
341
|
bodyBuffer,
|
|
311
342
|
responseUrl: url
|
|
312
343
|
});
|
|
313
344
|
});
|
|
314
|
-
res.on("error",
|
|
345
|
+
res.on("error", (err) => {
|
|
346
|
+
clearTimeout(totalTimer);
|
|
347
|
+
reject(err);
|
|
348
|
+
});
|
|
315
349
|
}
|
|
316
350
|
);
|
|
317
351
|
req.on("timeout", () => {
|
|
352
|
+
clearTimeout(totalTimer);
|
|
318
353
|
req.destroy();
|
|
319
354
|
reject(new NetworkError("Request timed out."));
|
|
320
355
|
});
|
|
321
356
|
req.on("error", (err) => {
|
|
322
|
-
|
|
357
|
+
clearTimeout(totalTimer);
|
|
358
|
+
if (abortController.signal.aborted) {
|
|
359
|
+
reject(new NetworkError("Request timed out (total deadline exceeded)."));
|
|
360
|
+
} else {
|
|
361
|
+
reject(new NetworkError(`Request failed: ${err.message}`, void 0, { cause: err }));
|
|
362
|
+
}
|
|
323
363
|
});
|
|
324
364
|
req.end();
|
|
325
365
|
});
|
|
@@ -372,24 +412,47 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
|
|
|
372
412
|
);
|
|
373
413
|
}
|
|
374
414
|
const parsed = new URL(url);
|
|
375
|
-
const
|
|
415
|
+
const pinnedAddr = resolvedIP.includes(":") ? `[${resolvedIP}]` : resolvedIP;
|
|
416
|
+
const hostResolverRule = `MAP ${parsed.hostname} ${pinnedAddr}`;
|
|
417
|
+
const chromiumArgs = [
|
|
418
|
+
"--disable-features=WebSockets",
|
|
419
|
+
`--host-resolver-rules=${hostResolverRule}`,
|
|
420
|
+
// Avoid /dev/shm exhaustion in Docker containers (default 64MB)
|
|
421
|
+
"--disable-dev-shm-usage"
|
|
422
|
+
];
|
|
423
|
+
if (process.platform === "linux" && process.getuid?.() === 0) {
|
|
424
|
+
chromiumArgs.push("--no-sandbox");
|
|
425
|
+
}
|
|
376
426
|
const browser = await playwright.chromium.launch({
|
|
377
427
|
headless: true,
|
|
378
|
-
args:
|
|
428
|
+
args: chromiumArgs
|
|
379
429
|
});
|
|
380
430
|
try {
|
|
381
431
|
const context = await browser.newContext({
|
|
382
432
|
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
383
433
|
});
|
|
384
434
|
const page = await context.newPage();
|
|
435
|
+
const initialHostname = parsed.hostname;
|
|
385
436
|
await context.route("**/*", async (route) => {
|
|
386
437
|
const requestUrl = route.request().url();
|
|
387
438
|
try {
|
|
388
|
-
const
|
|
389
|
-
if (
|
|
390
|
-
await
|
|
439
|
+
const reqParsed = new URL(requestUrl);
|
|
440
|
+
if (reqParsed.protocol !== "http:" && reqParsed.protocol !== "https:") {
|
|
441
|
+
await route.continue();
|
|
442
|
+
return;
|
|
443
|
+
}
|
|
444
|
+
const subResolvedIP = await validateUrl(requestUrl);
|
|
445
|
+
if (reqParsed.hostname === initialHostname) {
|
|
446
|
+
await route.continue();
|
|
447
|
+
return;
|
|
391
448
|
}
|
|
392
|
-
await
|
|
449
|
+
const response = await pinnedRequest(requestUrl, subResolvedIP, safTimeout);
|
|
450
|
+
const contentType = typeof response.headers["content-type"] === "string" ? response.headers["content-type"] : "application/octet-stream";
|
|
451
|
+
await route.fulfill({
|
|
452
|
+
status: response.status,
|
|
453
|
+
contentType,
|
|
454
|
+
body: response.bodyBuffer
|
|
455
|
+
});
|
|
393
456
|
} catch {
|
|
394
457
|
await route.abort("blockedbyclient");
|
|
395
458
|
}
|
|
@@ -407,7 +470,10 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
|
|
|
407
470
|
}
|
|
408
471
|
return { html, finalUrl: page.url() };
|
|
409
472
|
} finally {
|
|
410
|
-
|
|
473
|
+
try {
|
|
474
|
+
await browser.close();
|
|
475
|
+
} catch {
|
|
476
|
+
}
|
|
411
477
|
}
|
|
412
478
|
}
|
|
413
479
|
async function fetchRawText(url, timeout) {
|
|
@@ -571,7 +637,7 @@ function extractMdx(html, url) {
|
|
|
571
637
|
return extractMintlifyMdx(html, url);
|
|
572
638
|
}
|
|
573
639
|
function processRawMdx(mdx, url) {
|
|
574
|
-
const { body, metadata } = parseFrontmatter(mdx);
|
|
640
|
+
const { body, metadata } = parseFrontmatter(mdx.replace(/\r\n/g, "\n"));
|
|
575
641
|
let markdown = stripMdxComponents(body);
|
|
576
642
|
markdown = resolveRelativeUrls(markdown, url);
|
|
577
643
|
markdown = markdown.replace(/\n{3,}/g, "\n\n");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "web-to-markdown",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"description": "Convert any web page to clean Markdown. Built for developers and LLM pipelines.",
|
|
5
5
|
"author": "Nidhi Singh",
|
|
6
6
|
"repository": {
|
|
@@ -66,6 +66,7 @@
|
|
|
66
66
|
"@types/node": "^22.0.0",
|
|
67
67
|
"@types/turndown": "^5.0.5",
|
|
68
68
|
"eslint": "^10.0.2",
|
|
69
|
+
"js-tiktoken": "^1.0.21",
|
|
69
70
|
"prettier": "^3.8.1",
|
|
70
71
|
"tsup": "^8.5.0",
|
|
71
72
|
"typescript": "^5.7.0",
|