web-to-markdown 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/cli.ts
4
- import { resolve, dirname } from "path";
5
- import { existsSync, lstatSync, readFileSync } from "fs";
4
+ import { resolve, dirname, sep } from "path";
5
+ import { existsSync, readFileSync } from "fs";
6
6
  import { realpath, writeFile } from "fs/promises";
7
7
  import { fileURLToPath } from "url";
8
8
  import { Command } from "commander";
@@ -47,10 +47,10 @@ function expandIPv6(ip) {
47
47
  } else {
48
48
  groups = ip.split(":");
49
49
  }
50
- return groups.map((g) => g.toLowerCase()).join(":");
50
+ return groups.map((g) => g.toLowerCase().padStart(4, "0")).join(":");
51
51
  }
52
52
  function normalizeIP(hostname) {
53
- const bare = hostname.replace(/^\[|\]$/g, "");
53
+ const bare = hostname.replace(/^\[|\]$/g, "").replace(/%.*$/, "");
54
54
  if (isIP(bare) === 4) return bare;
55
55
  if (isIP(bare) === 6) {
56
56
  const ffmpDotted = bare.match(/^::ffff:(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/i);
@@ -62,7 +62,9 @@ function normalizeIP(hostname) {
62
62
  return `${high >> 8 & 255}.${high & 255}.${low >> 8 & 255}.${low & 255}`;
63
63
  }
64
64
  const fullFfmpHex = expandIPv6(bare);
65
- const fullFfmpMatch = fullFfmpHex.match(/^0:0:0:0:0:ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i);
65
+ const fullFfmpMatch = fullFfmpHex.match(
66
+ /^0000:0000:0000:0000:0000:ffff:([0-9a-f]{4}):([0-9a-f]{4})$/i
67
+ );
66
68
  if (fullFfmpMatch) {
67
69
  const high = parseInt(fullFfmpMatch[1], 16);
68
70
  const low = parseInt(fullFfmpMatch[2], 16);
@@ -119,20 +121,21 @@ function isPrivateIP(ip) {
119
121
  if (ip.startsWith("198.51.100.")) return true;
120
122
  if (ip.startsWith("203.0.113.")) return true;
121
123
  if (/^198\.(18|19)\./.test(ip)) return true;
122
- if (/^24[0-9]\./.test(ip) || ip.startsWith("255.")) return true;
124
+ if (ip.includes(".") && !ip.includes(":")) {
125
+ const firstOctet = parseInt(ip.split(".")[0], 10);
126
+ if (firstOctet >= 224) return true;
127
+ }
123
128
  const expanded = ip.includes("::") ? expandIPv6(ip) : ip.toLowerCase();
124
- if (ip === "::1" || ip === "::" || expanded === "0:0:0:0:0:0:0:1" || expanded === "0:0:0:0:0:0:0:0")
129
+ if (ip === "::1" || ip === "::" || expanded === "0000:0000:0000:0000:0000:0000:0000:0001" || expanded === "0000:0000:0000:0000:0000:0000:0000:0000")
125
130
  return true;
126
131
  if (expanded.startsWith("fc") || expanded.startsWith("fd")) return true;
127
- if (expanded.startsWith("fe80:") || expanded.startsWith("fe8") || expanded.startsWith("fe9") || expanded.startsWith("fea") || expanded.startsWith("feb"))
128
- return true;
129
- if (expanded.startsWith("fec0:") || expanded.startsWith("fec") || expanded.startsWith("fed") || expanded.startsWith("fee") || expanded.startsWith("fef"))
130
- return true;
132
+ if (/^fe[89ab][0-9a-f]:/.test(expanded)) return true;
133
+ if (/^fe[cdef][0-9a-f]:/.test(expanded)) return true;
131
134
  if (expanded.startsWith("ff")) return true;
132
- if (expanded.startsWith("64:ff9b:")) return true;
135
+ if (expanded.startsWith("0064:ff9b:")) return true;
133
136
  if (expanded.startsWith("2002:")) return true;
134
- if (expanded.startsWith("2001:db8:")) return true;
135
- if (expanded.startsWith("2001:0:")) return true;
137
+ if (expanded.startsWith("2001:0db8:")) return true;
138
+ if (expanded.startsWith("2001:0000:")) return true;
136
139
  return false;
137
140
  }
138
141
 
@@ -182,6 +185,8 @@ var BLOCKED_HOSTNAME_SUFFIXES = [
182
185
  ".internal",
183
186
  ".local",
184
187
  ".localhost",
188
+ ".localdomain",
189
+ ".intranet",
185
190
  ".corp",
186
191
  ".home",
187
192
  ".lan"
@@ -202,13 +207,18 @@ async function resolveAndValidateHostname(hostname) {
202
207
  return normalizedIP;
203
208
  }
204
209
  try {
205
- const { address } = await lookup(hostname);
206
- if (isPrivateIP(address)) {
207
- throw new SSRFError(
208
- `Blocked request to "${hostname}" \u2014 resolves to private IP. Requests to internal networks are not allowed.`
209
- );
210
+ const results = await lookup(hostname, { all: true });
211
+ if (results.length === 0) {
212
+ throw new NetworkError(`DNS lookup for "${hostname}" returned no results. Check the URL.`);
213
+ }
214
+ for (const { address } of results) {
215
+ if (isPrivateIP(address)) {
216
+ throw new SSRFError(
217
+ `Blocked request to "${hostname}" \u2014 resolves to private IP. Requests to internal networks are not allowed.`
218
+ );
219
+ }
210
220
  }
211
- return address;
221
+ return results[0].address;
212
222
  } catch (err) {
213
223
  if (err instanceof SSRFError) throw err;
214
224
  if (err instanceof Error) {
@@ -233,7 +243,7 @@ async function validateUrl(url) {
233
243
  `Unsupported protocol "${parsed.protocol}". Only HTTP and HTTPS are supported.`
234
244
  );
235
245
  }
236
- const hostname = parsed.hostname.toLowerCase();
246
+ const hostname = parsed.hostname.toLowerCase().replace(/\.$/, "");
237
247
  if (BLOCKED_HOSTNAMES.has(hostname)) {
238
248
  throw new SSRFError(
239
249
  `Blocked request to "${hostname}". Requests to internal hosts are not allowed.`
@@ -248,6 +258,20 @@ async function validateUrl(url) {
248
258
  }
249
259
  return resolveAndValidateHostname(hostname);
250
260
  }
261
+ function extractCharset(headers) {
262
+ const raw = headers["content-type"];
263
+ const contentType = typeof raw === "string" ? raw : "";
264
+ const match = contentType.match(/charset\s*=\s*["']?([^"';,\s]+)["']?/i);
265
+ return match ? match[1].toLowerCase() : null;
266
+ }
267
+ function decodeBody(buffer, charset) {
268
+ const encoding = charset || "utf-8";
269
+ try {
270
+ return new TextDecoder(encoding).decode(buffer);
271
+ } catch {
272
+ return buffer.toString("utf-8");
273
+ }
274
+ }
251
275
  var HTML_CONTENT_TYPES = [
252
276
  "text/html",
253
277
  "application/xhtml+xml",
@@ -282,11 +306,14 @@ function pinnedRequest(url, resolvedIP, timeout) {
282
306
  const parsed = new URL(url);
283
307
  const isHttps = parsed.protocol === "https:";
284
308
  const requestFn = isHttps ? httpsRequest : httpRequest;
309
+ const abortController = new AbortController();
310
+ const totalTimer = setTimeout(() => abortController.abort(), timeout);
285
311
  const req = requestFn(
286
312
  url,
287
313
  {
288
314
  method: "GET",
289
315
  timeout,
316
+ signal: abortController.signal,
290
317
  lookup: createPinnedLookup(resolvedIP),
291
318
  headers: {
292
319
  "User-Agent": "Mozilla/5.0 (compatible; web-to-markdown/0.1; +https://github.com/nidhi-singh02/mark-it-down)",
@@ -300,6 +327,7 @@ function pinnedRequest(url, resolvedIP, timeout) {
300
327
  res.on("data", (chunk) => {
301
328
  totalBytes += chunk.length;
302
329
  if (totalBytes > MAX_RESPONSE_SIZE) {
330
+ clearTimeout(totalTimer);
303
331
  req.destroy();
304
332
  reject(
305
333
  new ContentError(
@@ -311,24 +339,36 @@ function pinnedRequest(url, resolvedIP, timeout) {
311
339
  chunks.push(chunk);
312
340
  });
313
341
  res.on("end", () => {
342
+ clearTimeout(totalTimer);
314
343
  const bodyBuffer = Buffer.concat(chunks);
344
+ const responseHeaders = res.headers;
345
+ const charset = extractCharset(responseHeaders);
315
346
  resolve2({
316
347
  status: res.statusCode || 0,
317
- headers: res.headers,
318
- body: bodyBuffer.toString("utf-8"),
348
+ headers: responseHeaders,
349
+ body: decodeBody(bodyBuffer, charset),
319
350
  bodyBuffer,
320
351
  responseUrl: url
321
352
  });
322
353
  });
323
- res.on("error", reject);
354
+ res.on("error", (err) => {
355
+ clearTimeout(totalTimer);
356
+ reject(err);
357
+ });
324
358
  }
325
359
  );
326
360
  req.on("timeout", () => {
361
+ clearTimeout(totalTimer);
327
362
  req.destroy();
328
363
  reject(new NetworkError("Request timed out."));
329
364
  });
330
365
  req.on("error", (err) => {
331
- reject(new NetworkError(`Request failed: ${err.message}`, void 0, { cause: err }));
366
+ clearTimeout(totalTimer);
367
+ if (abortController.signal.aborted) {
368
+ reject(new NetworkError("Request timed out (total deadline exceeded)."));
369
+ } else {
370
+ reject(new NetworkError(`Request failed: ${err.message}`, void 0, { cause: err }));
371
+ }
332
372
  });
333
373
  req.end();
334
374
  });
@@ -381,24 +421,47 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
381
421
  );
382
422
  }
383
423
  const parsed = new URL(url);
384
- const hostResolverRule = `MAP ${parsed.hostname} ${resolvedIP}`;
424
+ const pinnedAddr = resolvedIP.includes(":") ? `[${resolvedIP}]` : resolvedIP;
425
+ const hostResolverRule = `MAP ${parsed.hostname} ${pinnedAddr}`;
426
+ const chromiumArgs = [
427
+ "--disable-features=WebSockets",
428
+ `--host-resolver-rules=${hostResolverRule}`,
429
+ // Avoid /dev/shm exhaustion in Docker containers (default 64MB)
430
+ "--disable-dev-shm-usage"
431
+ ];
432
+ if (process.platform === "linux" && process.getuid?.() === 0) {
433
+ chromiumArgs.push("--no-sandbox");
434
+ }
385
435
  const browser = await playwright.chromium.launch({
386
436
  headless: true,
387
- args: ["--disable-websockets", `--host-resolver-rules=${hostResolverRule}`]
437
+ args: chromiumArgs
388
438
  });
389
439
  try {
390
440
  const context = await browser.newContext({
391
441
  userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
392
442
  });
393
443
  const page = await context.newPage();
444
+ const initialHostname = parsed.hostname;
394
445
  await context.route("**/*", async (route) => {
395
446
  const requestUrl = route.request().url();
396
447
  try {
397
- const parsed2 = new URL(requestUrl);
398
- if (parsed2.protocol === "http:" || parsed2.protocol === "https:") {
399
- await validateUrl(requestUrl);
448
+ const reqParsed = new URL(requestUrl);
449
+ if (reqParsed.protocol !== "http:" && reqParsed.protocol !== "https:") {
450
+ await route.continue();
451
+ return;
452
+ }
453
+ const subResolvedIP = await validateUrl(requestUrl);
454
+ if (reqParsed.hostname === initialHostname) {
455
+ await route.continue();
456
+ return;
400
457
  }
401
- await route.continue();
458
+ const response = await pinnedRequest(requestUrl, subResolvedIP, safTimeout);
459
+ const contentType = typeof response.headers["content-type"] === "string" ? response.headers["content-type"] : "application/octet-stream";
460
+ await route.fulfill({
461
+ status: response.status,
462
+ contentType,
463
+ body: response.bodyBuffer
464
+ });
402
465
  } catch {
403
466
  await route.abort("blockedbyclient");
404
467
  }
@@ -416,7 +479,10 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
416
479
  }
417
480
  return { html, finalUrl: page.url() };
418
481
  } finally {
419
- await browser.close();
482
+ try {
483
+ await browser.close();
484
+ } catch {
485
+ }
420
486
  }
421
487
  }
422
488
  async function fetchRawText(url, timeout) {
@@ -580,7 +646,7 @@ function extractMdx(html, url) {
580
646
  return extractMintlifyMdx(html, url);
581
647
  }
582
648
  function processRawMdx(mdx, url) {
583
- const { body, metadata } = parseFrontmatter(mdx);
649
+ const { body, metadata } = parseFrontmatter(mdx.replace(/\r\n/g, "\n"));
584
650
  let markdown = stripMdxComponents(body);
585
651
  markdown = resolveRelativeUrls(markdown, url);
586
652
  markdown = markdown.replace(/\n{3,}/g, "\n\n");
@@ -1497,7 +1563,8 @@ function getPackageVersion() {
1497
1563
  async function validateOutputPath(outputPath) {
1498
1564
  const resolved = resolve(outputPath);
1499
1565
  const cwd = process.cwd();
1500
- if (!resolved.startsWith(cwd + "/") && resolved !== cwd) {
1566
+ const cwdPrefix = cwd.endsWith(sep) ? cwd : cwd + sep;
1567
+ if (!resolved.startsWith(cwdPrefix) && resolved !== cwd) {
1501
1568
  throw new Error(
1502
1569
  `Output path "${outputPath}" resolves outside the current directory.
1503
1570
  Resolved to: ${resolved}
@@ -1507,17 +1574,15 @@ For safety, output files must be within the working directory.`
1507
1574
  let checkPath = resolved;
1508
1575
  while (checkPath !== cwd && checkPath !== dirname(checkPath)) {
1509
1576
  if (existsSync(checkPath)) {
1510
- const stat = lstatSync(checkPath);
1511
- if (stat.isSymbolicLink()) {
1512
- const realTarget = await realpath(checkPath);
1513
- const realCwd = await realpath(cwd);
1514
- if (!realTarget.startsWith(realCwd + "/") && realTarget !== realCwd) {
1515
- throw new Error(
1516
- `Output path "${outputPath}" follows a symlink outside the current directory.
1517
- Symlink "${checkPath}" points to "${realTarget}".
1518
- For safety, output files must not escape the working directory via symlinks.`
1519
- );
1520
- }
1577
+ const realTarget = await realpath(checkPath);
1578
+ const realCwd = await realpath(cwd);
1579
+ const realCwdPrefix = realCwd.endsWith(sep) ? realCwd : realCwd + sep;
1580
+ if (!realTarget.startsWith(realCwdPrefix) && realTarget !== realCwd) {
1581
+ throw new Error(
1582
+ `Output path "${outputPath}" resolves outside the current directory.
1583
+ "${checkPath}" resolves to "${realTarget}".
1584
+ For safety, output files must not escape the working directory via symlinks or junctions.`
1585
+ );
1521
1586
  }
1522
1587
  break;
1523
1588
  }
@@ -1583,6 +1648,9 @@ Received ${signal}, shutting down\u2026
1583
1648
  };
1584
1649
  process.on("SIGINT", onSignal);
1585
1650
  process.on("SIGTERM", onSignal);
1651
+ if (process.platform === "win32") {
1652
+ process.on("SIGBREAK", onSignal);
1653
+ }
1586
1654
  await program.parseAsync(process.argv);
1587
1655
  }
1588
1656
 
package/dist/index.js CHANGED
@@ -38,10 +38,10 @@ function expandIPv6(ip) {
38
38
  } else {
39
39
  groups = ip.split(":");
40
40
  }
41
- return groups.map((g) => g.toLowerCase()).join(":");
41
+ return groups.map((g) => g.toLowerCase().padStart(4, "0")).join(":");
42
42
  }
43
43
  function normalizeIP(hostname) {
44
- const bare = hostname.replace(/^\[|\]$/g, "");
44
+ const bare = hostname.replace(/^\[|\]$/g, "").replace(/%.*$/, "");
45
45
  if (isIP(bare) === 4) return bare;
46
46
  if (isIP(bare) === 6) {
47
47
  const ffmpDotted = bare.match(/^::ffff:(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/i);
@@ -53,7 +53,9 @@ function normalizeIP(hostname) {
53
53
  return `${high >> 8 & 255}.${high & 255}.${low >> 8 & 255}.${low & 255}`;
54
54
  }
55
55
  const fullFfmpHex = expandIPv6(bare);
56
- const fullFfmpMatch = fullFfmpHex.match(/^0:0:0:0:0:ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i);
56
+ const fullFfmpMatch = fullFfmpHex.match(
57
+ /^0000:0000:0000:0000:0000:ffff:([0-9a-f]{4}):([0-9a-f]{4})$/i
58
+ );
57
59
  if (fullFfmpMatch) {
58
60
  const high = parseInt(fullFfmpMatch[1], 16);
59
61
  const low = parseInt(fullFfmpMatch[2], 16);
@@ -110,20 +112,21 @@ function isPrivateIP(ip) {
110
112
  if (ip.startsWith("198.51.100.")) return true;
111
113
  if (ip.startsWith("203.0.113.")) return true;
112
114
  if (/^198\.(18|19)\./.test(ip)) return true;
113
- if (/^24[0-9]\./.test(ip) || ip.startsWith("255.")) return true;
115
+ if (ip.includes(".") && !ip.includes(":")) {
116
+ const firstOctet = parseInt(ip.split(".")[0], 10);
117
+ if (firstOctet >= 224) return true;
118
+ }
114
119
  const expanded = ip.includes("::") ? expandIPv6(ip) : ip.toLowerCase();
115
- if (ip === "::1" || ip === "::" || expanded === "0:0:0:0:0:0:0:1" || expanded === "0:0:0:0:0:0:0:0")
120
+ if (ip === "::1" || ip === "::" || expanded === "0000:0000:0000:0000:0000:0000:0000:0001" || expanded === "0000:0000:0000:0000:0000:0000:0000:0000")
116
121
  return true;
117
122
  if (expanded.startsWith("fc") || expanded.startsWith("fd")) return true;
118
- if (expanded.startsWith("fe80:") || expanded.startsWith("fe8") || expanded.startsWith("fe9") || expanded.startsWith("fea") || expanded.startsWith("feb"))
119
- return true;
120
- if (expanded.startsWith("fec0:") || expanded.startsWith("fec") || expanded.startsWith("fed") || expanded.startsWith("fee") || expanded.startsWith("fef"))
121
- return true;
123
+ if (/^fe[89ab][0-9a-f]:/.test(expanded)) return true;
124
+ if (/^fe[cdef][0-9a-f]:/.test(expanded)) return true;
122
125
  if (expanded.startsWith("ff")) return true;
123
- if (expanded.startsWith("64:ff9b:")) return true;
126
+ if (expanded.startsWith("0064:ff9b:")) return true;
124
127
  if (expanded.startsWith("2002:")) return true;
125
- if (expanded.startsWith("2001:db8:")) return true;
126
- if (expanded.startsWith("2001:0:")) return true;
128
+ if (expanded.startsWith("2001:0db8:")) return true;
129
+ if (expanded.startsWith("2001:0000:")) return true;
127
130
  return false;
128
131
  }
129
132
 
@@ -173,6 +176,8 @@ var BLOCKED_HOSTNAME_SUFFIXES = [
173
176
  ".internal",
174
177
  ".local",
175
178
  ".localhost",
179
+ ".localdomain",
180
+ ".intranet",
176
181
  ".corp",
177
182
  ".home",
178
183
  ".lan"
@@ -193,13 +198,18 @@ async function resolveAndValidateHostname(hostname) {
193
198
  return normalizedIP;
194
199
  }
195
200
  try {
196
- const { address } = await lookup(hostname);
197
- if (isPrivateIP(address)) {
198
- throw new SSRFError(
199
- `Blocked request to "${hostname}" \u2014 resolves to private IP. Requests to internal networks are not allowed.`
200
- );
201
+ const results = await lookup(hostname, { all: true });
202
+ if (results.length === 0) {
203
+ throw new NetworkError(`DNS lookup for "${hostname}" returned no results. Check the URL.`);
201
204
  }
202
- return address;
205
+ for (const { address } of results) {
206
+ if (isPrivateIP(address)) {
207
+ throw new SSRFError(
208
+ `Blocked request to "${hostname}" \u2014 resolves to private IP. Requests to internal networks are not allowed.`
209
+ );
210
+ }
211
+ }
212
+ return results[0].address;
203
213
  } catch (err) {
204
214
  if (err instanceof SSRFError) throw err;
205
215
  if (err instanceof Error) {
@@ -224,7 +234,7 @@ async function validateUrl(url) {
224
234
  `Unsupported protocol "${parsed.protocol}". Only HTTP and HTTPS are supported.`
225
235
  );
226
236
  }
227
- const hostname = parsed.hostname.toLowerCase();
237
+ const hostname = parsed.hostname.toLowerCase().replace(/\.$/, "");
228
238
  if (BLOCKED_HOSTNAMES.has(hostname)) {
229
239
  throw new SSRFError(
230
240
  `Blocked request to "${hostname}". Requests to internal hosts are not allowed.`
@@ -239,6 +249,20 @@ async function validateUrl(url) {
239
249
  }
240
250
  return resolveAndValidateHostname(hostname);
241
251
  }
252
+ function extractCharset(headers) {
253
+ const raw = headers["content-type"];
254
+ const contentType = typeof raw === "string" ? raw : "";
255
+ const match = contentType.match(/charset\s*=\s*["']?([^"';,\s]+)["']?/i);
256
+ return match ? match[1].toLowerCase() : null;
257
+ }
258
+ function decodeBody(buffer, charset) {
259
+ const encoding = charset || "utf-8";
260
+ try {
261
+ return new TextDecoder(encoding).decode(buffer);
262
+ } catch {
263
+ return buffer.toString("utf-8");
264
+ }
265
+ }
242
266
  var HTML_CONTENT_TYPES = [
243
267
  "text/html",
244
268
  "application/xhtml+xml",
@@ -273,11 +297,14 @@ function pinnedRequest(url, resolvedIP, timeout) {
273
297
  const parsed = new URL(url);
274
298
  const isHttps = parsed.protocol === "https:";
275
299
  const requestFn = isHttps ? httpsRequest : httpRequest;
300
+ const abortController = new AbortController();
301
+ const totalTimer = setTimeout(() => abortController.abort(), timeout);
276
302
  const req = requestFn(
277
303
  url,
278
304
  {
279
305
  method: "GET",
280
306
  timeout,
307
+ signal: abortController.signal,
281
308
  lookup: createPinnedLookup(resolvedIP),
282
309
  headers: {
283
310
  "User-Agent": "Mozilla/5.0 (compatible; web-to-markdown/0.1; +https://github.com/nidhi-singh02/mark-it-down)",
@@ -291,6 +318,7 @@ function pinnedRequest(url, resolvedIP, timeout) {
291
318
  res.on("data", (chunk) => {
292
319
  totalBytes += chunk.length;
293
320
  if (totalBytes > MAX_RESPONSE_SIZE) {
321
+ clearTimeout(totalTimer);
294
322
  req.destroy();
295
323
  reject(
296
324
  new ContentError(
@@ -302,24 +330,36 @@ function pinnedRequest(url, resolvedIP, timeout) {
302
330
  chunks.push(chunk);
303
331
  });
304
332
  res.on("end", () => {
333
+ clearTimeout(totalTimer);
305
334
  const bodyBuffer = Buffer.concat(chunks);
335
+ const responseHeaders = res.headers;
336
+ const charset = extractCharset(responseHeaders);
306
337
  resolve({
307
338
  status: res.statusCode || 0,
308
- headers: res.headers,
309
- body: bodyBuffer.toString("utf-8"),
339
+ headers: responseHeaders,
340
+ body: decodeBody(bodyBuffer, charset),
310
341
  bodyBuffer,
311
342
  responseUrl: url
312
343
  });
313
344
  });
314
- res.on("error", reject);
345
+ res.on("error", (err) => {
346
+ clearTimeout(totalTimer);
347
+ reject(err);
348
+ });
315
349
  }
316
350
  );
317
351
  req.on("timeout", () => {
352
+ clearTimeout(totalTimer);
318
353
  req.destroy();
319
354
  reject(new NetworkError("Request timed out."));
320
355
  });
321
356
  req.on("error", (err) => {
322
- reject(new NetworkError(`Request failed: ${err.message}`, void 0, { cause: err }));
357
+ clearTimeout(totalTimer);
358
+ if (abortController.signal.aborted) {
359
+ reject(new NetworkError("Request timed out (total deadline exceeded)."));
360
+ } else {
361
+ reject(new NetworkError(`Request failed: ${err.message}`, void 0, { cause: err }));
362
+ }
323
363
  });
324
364
  req.end();
325
365
  });
@@ -372,24 +412,47 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
372
412
  );
373
413
  }
374
414
  const parsed = new URL(url);
375
- const hostResolverRule = `MAP ${parsed.hostname} ${resolvedIP}`;
415
+ const pinnedAddr = resolvedIP.includes(":") ? `[${resolvedIP}]` : resolvedIP;
416
+ const hostResolverRule = `MAP ${parsed.hostname} ${pinnedAddr}`;
417
+ const chromiumArgs = [
418
+ "--disable-features=WebSockets",
419
+ `--host-resolver-rules=${hostResolverRule}`,
420
+ // Avoid /dev/shm exhaustion in Docker containers (default 64MB)
421
+ "--disable-dev-shm-usage"
422
+ ];
423
+ if (process.platform === "linux" && process.getuid?.() === 0) {
424
+ chromiumArgs.push("--no-sandbox");
425
+ }
376
426
  const browser = await playwright.chromium.launch({
377
427
  headless: true,
378
- args: ["--disable-websockets", `--host-resolver-rules=${hostResolverRule}`]
428
+ args: chromiumArgs
379
429
  });
380
430
  try {
381
431
  const context = await browser.newContext({
382
432
  userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
383
433
  });
384
434
  const page = await context.newPage();
435
+ const initialHostname = parsed.hostname;
385
436
  await context.route("**/*", async (route) => {
386
437
  const requestUrl = route.request().url();
387
438
  try {
388
- const parsed2 = new URL(requestUrl);
389
- if (parsed2.protocol === "http:" || parsed2.protocol === "https:") {
390
- await validateUrl(requestUrl);
439
+ const reqParsed = new URL(requestUrl);
440
+ if (reqParsed.protocol !== "http:" && reqParsed.protocol !== "https:") {
441
+ await route.continue();
442
+ return;
443
+ }
444
+ const subResolvedIP = await validateUrl(requestUrl);
445
+ if (reqParsed.hostname === initialHostname) {
446
+ await route.continue();
447
+ return;
391
448
  }
392
- await route.continue();
449
+ const response = await pinnedRequest(requestUrl, subResolvedIP, safTimeout);
450
+ const contentType = typeof response.headers["content-type"] === "string" ? response.headers["content-type"] : "application/octet-stream";
451
+ await route.fulfill({
452
+ status: response.status,
453
+ contentType,
454
+ body: response.bodyBuffer
455
+ });
393
456
  } catch {
394
457
  await route.abort("blockedbyclient");
395
458
  }
@@ -407,7 +470,10 @@ async function fetchWithBrowser(url, resolvedIP, timeout) {
407
470
  }
408
471
  return { html, finalUrl: page.url() };
409
472
  } finally {
410
- await browser.close();
473
+ try {
474
+ await browser.close();
475
+ } catch {
476
+ }
411
477
  }
412
478
  }
413
479
  async function fetchRawText(url, timeout) {
@@ -571,7 +637,7 @@ function extractMdx(html, url) {
571
637
  return extractMintlifyMdx(html, url);
572
638
  }
573
639
  function processRawMdx(mdx, url) {
574
- const { body, metadata } = parseFrontmatter(mdx);
640
+ const { body, metadata } = parseFrontmatter(mdx.replace(/\r\n/g, "\n"));
575
641
  let markdown = stripMdxComponents(body);
576
642
  markdown = resolveRelativeUrls(markdown, url);
577
643
  markdown = markdown.replace(/\n{3,}/g, "\n\n");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "web-to-markdown",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Convert any web page to clean Markdown. Built for developers and LLM pipelines.",
5
5
  "author": "Nidhi Singh",
6
6
  "repository": {
@@ -66,6 +66,7 @@
66
66
  "@types/node": "^22.0.0",
67
67
  "@types/turndown": "^5.0.5",
68
68
  "eslint": "^10.0.2",
69
+ "js-tiktoken": "^1.0.21",
69
70
  "prettier": "^3.8.1",
70
71
  "tsup": "^8.5.0",
71
72
  "typescript": "^5.7.0",