into-md 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/CLAUDE.md +251 -0
- package/.claude/settings.json +15 -0
- package/.claude/settings.local.json +9 -0
- package/.cursor/hooks.json +10 -0
- package/.vscode/settings.json +53 -0
- package/AGENTS.md +284 -0
- package/CLAUDE.md +111 -0
- package/GEMINI.md +123 -0
- package/README.md +133 -0
- package/biome.jsonc +4 -0
- package/bun.lock +413 -0
- package/dist/index.d.mts +3 -0
- package/dist/index.mjs +446 -0
- package/dist/index.mjs.map +1 -0
- package/docs/SPEC.md +201 -0
- package/package.json +39 -0
- package/src/cache.ts +79 -0
- package/src/converter.ts +96 -0
- package/src/extractor.ts +85 -0
- package/src/fetcher.ts +236 -0
- package/src/images.ts +27 -0
- package/src/index.ts +143 -0
- package/src/metadata.ts +30 -0
- package/src/tables.ts +80 -0
- package/src/types/jsdom.d.ts +10 -0
- package/src/utils.ts +28 -0
- package/tsconfig.json +29 -0
- package/tsdown.config.ts +14 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { mkdir, readFile, stat, writeFile } from "node:fs/promises";
|
|
4
|
+
import { Command } from "commander";
|
|
5
|
+
import { load } from "cheerio";
|
|
6
|
+
import TurndownService from "turndown";
|
|
7
|
+
import { Readability } from "@mozilla/readability";
|
|
8
|
+
import { JSDOM } from "jsdom";
|
|
9
|
+
import { readFileSync } from "node:fs";
|
|
10
|
+
import { basename, dirname, join } from "node:path";
|
|
11
|
+
import { createHash } from "node:crypto";
|
|
12
|
+
|
|
13
|
+
//#region src/utils.ts
|
|
14
|
+
/**
 * Resolves `url` against `baseUrl` and returns the absolute form.
 *
 * @param {string | undefined} url - Possibly relative URL.
 * @param {string} baseUrl - Base used for resolution.
 * @returns {string | undefined} `undefined` for an empty/missing `url`; the
 *   resolved URL on success; the untouched input when the URL parser rejects it.
 */
const toAbsoluteUrl = (url, baseUrl) => {
  if (url) {
    let resolved;
    try {
      resolved = new URL(url, baseUrl).toString();
    } catch {
      // Unparseable input is passed through rather than dropped.
      resolved = url;
    }
    return resolved;
  }
  return undefined;
};
|
|
26
|
+
/**
 * Serializes the inner HTML of `<body>`; when the selection for `body` is
 * empty, serializes the document root instead. A null serialization result
 * becomes the empty string.
 *
 * @param $ - Loaded cheerio instance.
 * @returns {string} Inner HTML of the chosen container.
 */
const getBodyHtml = ($) => {
  const bodyNodes = $("body");
  const container = bodyNodes.length ? bodyNodes : $.root();
  return container.html() ?? "";
};
|
|
34
|
+
|
|
35
|
+
//#endregion
|
|
36
|
+
//#region src/converter.ts
|
|
37
|
+
/**
 * Loads `html`, rewrites link and image URLs to absolute form against
 * `baseUrl`, strips `<script>` and `<style>` elements, and returns the
 * resulting body HTML.
 *
 * @param {string} html - Markup to prepare.
 * @param {string} baseUrl - Base for resolving relative URLs.
 * @returns {string} Prepared body HTML.
 */
function prepareDom(html, baseUrl) {
  const $ = load(html);
  // [selector, attribute] pairs whose values are made absolute.
  const urlAttributes = [
    ["a[href]", "href"],
    ["img[src]", "src"]
  ];
  for (const [selector, attribute] of urlAttributes) {
    $(selector).each((_, node) => {
      const $node = $(node);
      const resolved = toAbsoluteUrl($node.attr(attribute), baseUrl);
      if (resolved) {
        $node.attr(attribute, resolved);
      }
    });
  }
  $("script, style").remove();
  return getBodyHtml($);
}
|
|
52
|
+
/**
 * Converts an HTML fragment to Markdown with Turndown, using custom rules
 * for links, images, JSON table blocks and embedded media.
 *
 * @param {string} html - Markup to convert.
 * @param {{ baseUrl: string, stripLinks?: boolean }} options - `baseUrl` is
 *   used to absolutize URLs; `stripLinks` keeps only anchor text.
 * @returns {string} Markdown output.
 */
function convertHtmlToMarkdown(html, options) {
  const prepared = prepareDom(html, options.baseUrl);
  const turndown = new TurndownService({
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    headingStyle: "atx"
  });

  // Links: plain text when stripping is requested or no href is present.
  turndown.addRule("stripLinks", {
    filter: "a",
    replacement: (content, node) => {
      if (options.stripLinks) return content;
      const href = node.getAttribute("href");
      if (!href) return content;
      return `[${content}](${href})`;
    }
  });

  // Images: standard Markdown image syntax, followed by an italic caption
  // line when the element carries a data-into-md-caption attribute.
  turndown.addRule("imagesWithCaption", {
    filter: "img",
    replacement: (_, node) => {
      const src = node.getAttribute("src") ?? "";
      const alt = node.getAttribute("alt") ?? "";
      const caption = node.getAttribute("data-into-md-caption");
      const imageLine = `![${alt}](${src})`;
      return caption ? `${imageLine}\n*${caption}*` : imageLine;
    }
  });

  // <pre data-into-md-table="true"> blocks become fenced JSON code blocks.
  turndown.addRule("tableJson", {
    filter: (node) => node.nodeName === "PRE" && node.getAttribute("data-into-md-table") === "true",
    replacement: (_content, node) => {
      return `\`\`\`json\n${node.textContent?.trim() ?? ""}\n\`\`\``;
    }
  });

  // Embedded media is reduced to a textual pointer at its src, or dropped
  // when there is no src attribute.
  turndown.addRule("embeds", {
    filter: [
      "iframe",
      "embed",
      "video"
    ],
    replacement: (_, node) => {
      const src = node.getAttribute("src") ?? "";
      if (!src) return "";
      return `[Embedded content: ${src}]`;
    }
  });

  return turndown.turndown(prepared);
}
|
|
100
|
+
|
|
101
|
+
//#endregion
|
|
102
|
+
//#region src/extractor.ts
|
|
103
|
+
/**
 * Removes from `document` every node matched by each selector, processing
 * the selectors in the order given.
 *
 * @param document - Object exposing `querySelectorAll`.
 * @param {string[]} selectors - CSS selectors whose matches are removed.
 */
function removeNodes(document, selectors) {
  selectors.forEach((selector) => {
    const matches = Array.from(document.querySelectorAll(selector));
    matches.forEach((node) => node.remove());
  });
}
|
|
106
|
+
/**
 * Collects title, description and author from a document's `<title>` and
 * `<meta>` elements.
 *
 * Each field tries its primary element first and then an Open Graph /
 * article fallback. Values are trimmed, and an empty or whitespace-only
 * value counts as missing so the fallback still gets a chance.
 *
 * @param document - Object exposing `querySelector`.
 * @param {string} source - Stored unchanged as `source` in the result.
 * @returns {{ author?: string, description?: string, source: string, title?: string }}
 */
function extractMetadata(document, source) {
  // Trim, and map empty/nullish values to undefined so `??` can fall back.
  const clean = (value) => {
    const trimmed = value?.trim();
    return trimmed ? trimmed : undefined;
  };
  const metaContent = (selector) => clean(document.querySelector(selector)?.getAttribute("content"));

  const title = clean(document.querySelector("title")?.textContent) ?? metaContent("meta[property=\"og:title\"]");
  const description = metaContent("meta[name=\"description\"]") ?? metaContent("meta[property=\"og:description\"]");
  const author = metaContent("meta[name=\"author\"]") ?? metaContent("meta[property=\"article:author\"]");

  return {
    author,
    description,
    source,
    title
  };
}
|
|
116
|
+
/**
 * Parses `html` with JSDOM, removes excluded nodes, and returns the HTML to
 * convert together with page metadata.
 *
 * In raw mode the whole document's outer HTML is returned. Otherwise
 * Readability runs on a clone of the document; its article content is used
 * when available, falling back to the body's inner HTML, and its title and
 * byline fill in metadata fields that are still empty.
 *
 * @param {string} html - Page markup.
 * @param {{ raw?: boolean, excludeSelectors?: string[], baseUrl: string }} options
 * @returns {{ html: string, metadata: object }}
 */
function extractContent(html, { raw = false, excludeSelectors = [], baseUrl }) {
  const dom = new JSDOM(html, { url: baseUrl });
  const { document } = dom.window;
  if (excludeSelectors.length > 0) {
    removeNodes(document, excludeSelectors);
  }

  const metadata = extractMetadata(document, baseUrl);
  if (raw) {
    return { html: document.documentElement.outerHTML, metadata };
  }

  // Readability gets a clone, so `document` itself stays usable below.
  const article = new Readability(document.cloneNode(true)).parse();
  if (!metadata.title && article?.title) {
    metadata.title = article.title;
  }
  if (!metadata.author && article?.byline) {
    metadata.author = article.byline;
  }

  return {
    html: article?.content ?? document.querySelector("body")?.innerHTML ?? "",
    metadata
  };
}
|
|
136
|
+
|
|
137
|
+
//#endregion
|
|
138
|
+
//#region src/cache.ts
|
|
139
|
+
// Cache root: <HOME>/.cache/into-md, or under the current working directory
// when HOME is not set.
const defaultCacheDir = join(process.env.HOME ?? process.cwd(), ".cache", "into-md");

// Default freshness window for cache entries, in milliseconds (one hour).
const DEFAULT_TTL_MS = 60 * 60 * 1000;

/**
 * Maps a URL to its cache file: `<cacheDir>/<sha256 hex of url>.json`.
 *
 * @param {string} url - URL used as the cache key.
 * @param {string} [cacheDir] - Directory holding cache files.
 * @returns {string} Path of the cache file.
 */
const buildCachePath = (url, cacheDir = defaultCacheDir) => {
  const digest = createHash("sha256").update(url).digest("hex");
  const fileName = `${digest}.json`;
  return join(cacheDir, fileName);
};
|
|
144
|
+
/**
 * Looks up the cached payload for `url`.
 *
 * @param {string} url - Cache key.
 * @param {{ enabled?: boolean, ttlMs?: number, cacheDir?: string }} [options]
 * @returns {Promise<object | null>} The stored payload, or `null` when
 *   caching is disabled, the file cannot be read or parsed, the entry is
 *   older than `ttlMs` (judged by file mtime), or its recorded `url` differs.
 */
async function readFromCache(url, options) {
  const { enabled = true, ttlMs = DEFAULT_TTL_MS, cacheDir = defaultCacheDir } = options ?? {};
  if (!enabled) {
    return null;
  }

  const cacheFile = buildCachePath(url, cacheDir);
  try {
    const [raw, fileInfo] = await Promise.all([readFile(cacheFile, "utf8"), stat(cacheFile)]);
    const entry = JSON.parse(raw);
    const expiresAt = fileInfo.mtimeMs + ttlMs;
    const isFresh = expiresAt > Date.now();
    // The url comparison guards against serving an entry stored for another key.
    return isFresh && entry.url === url ? entry : null;
  } catch {
    // Any read/parse failure is treated as a cache miss.
    return null;
  }
}
|
|
158
|
+
/**
 * Stores `content` for `url` as a pretty-printed JSON file holding the
 * content, the write timestamp and the URL. Does nothing when caching is
 * disabled. The cache directory is created if needed.
 *
 * @param {string} url - Cache key.
 * @param {string} content - Value to store.
 * @param {{ enabled?: boolean, cacheDir?: string }} [options]
 * @returns {Promise<void>}
 */
async function writeToCache(url, content, options) {
  const { enabled = true, cacheDir = defaultCacheDir } = options ?? {};
  if (enabled) {
    const cacheFile = buildCachePath(url, cacheDir);
    await mkdir(dirname(cacheFile), { recursive: true });
    const serialized = JSON.stringify({ content, fetchedAt: Date.now(), url }, null, 2);
    await writeFile(cacheFile, serialized, "utf8");
  }
}
|
|
170
|
+
|
|
171
|
+
//#endregion
|
|
172
|
+
//#region src/fetcher.ts
|
|
173
|
+
// User-Agent sent when the caller does not supply one.
const DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0 Safari/537.36";

// Fallback request timeout in milliseconds.
const DEFAULT_TIMEOUT_MS = 30000;

/**
 * Writes `message` to stderr, but only when `verbose` is truthy.
 *
 * @param {string} message - Text to log.
 * @param {boolean | undefined} verbose - Whether logging is enabled.
 */
const logVerbose = (message, verbose) => {
  if (!verbose) {
    return;
  }
  console.error(message);
};
|
|
178
|
+
/**
 * Parses one line of a Netscape-format cookies file.
 *
 * Fields, in order: domain, include-subdomains flag, path, secure flag,
 * expiry, name, value. Fields are split on tabs; if that yields fewer than
 * seven fields the line is re-split on runs of whitespace, so space-separated
 * files keep working. Lines prefixed with `#HttpOnly_` are cookie lines
 * (the prefix is stripped), not comments.
 *
 * @param {string} line - Raw line from the cookies file.
 * @returns {{ headerPair: string, record: { domain: string, expires: number, name: string, path: string, secure: boolean, value: string } } | null}
 *   `null` for blank lines, comments, and lines with missing or empty fields.
 */
function parseNetscapeCookieLine(line) {
  const HTTP_ONLY_PREFIX = "#HttpOnly_";
  let trimmed = line.trim();
  if (!trimmed) return null;
  if (trimmed.startsWith(HTTP_ONLY_PREFIX)) {
    trimmed = trimmed.slice(HTTP_ONLY_PREFIX.length);
  } else if (trimmed.startsWith("#")) {
    return null;
  }

  let parts = trimmed.split("\t");
  if (parts.length < 7) parts = trimmed.split(/\s+/);
  if (parts.length < 7) return null;

  // parts[1] (include-subdomains flag) is not used.
  const domain = parts[0];
  const path = parts[2];
  const secureFlag = parts[3];
  const expires = parts[4];
  const name = parts[5];
  const value = parts[6];
  if (!(domain && path && secureFlag && expires && name && value)) return null;

  return {
    headerPair: `${name}=${value}`,
    record: {
      domain,
      expires: Number(expires),
      name,
      path,
      secure: secureFlag.toLowerCase() === "true",
      value
    }
  };
}
|
|
202
|
+
/**
 * Reads a cookies file and returns its cookies in two shapes: a single
 * `name=value; ...` header string and a list of cookie records.
 *
 * @param {string | undefined} cookiesPath - File to read; when falsy, an
 *   empty result is returned without touching the filesystem.
 * @returns {{ header: string | undefined, playwrightCookies: object[] }}
 *   `header` is `undefined` when no line yields a cookie.
 * @throws {Error} When the file cannot be read (original error as `cause`).
 */
function parseCookiesFile(cookiesPath) {
  if (!cookiesPath) {
    return { header: undefined, playwrightCookies: [] };
  }

  let content;
  try {
    content = readFileSync(cookiesPath, "utf8");
  } catch (error) {
    throw new Error(`Unable to read cookies file "${basename(cookiesPath)}": ${String(error)}`, { cause: error });
  }

  // Lines the line parser rejects (it returns null) are dropped.
  const parsedLines = content
    .split("\n")
    .map((line) => parseNetscapeCookieLine(line))
    .filter(Boolean);
  const headerPairs = parsedLines.map((parsed) => parsed.headerPair);

  return {
    header: headerPairs.length ? headerPairs.join("; ") : undefined,
    playwrightCookies: parsedLines.map((parsed) => parsed.record)
  };
}
|
|
226
|
+
/**
 * Fetches `url` with the global `fetch`, following redirects, and decodes
 * the body to text.
 *
 * @param {string} url - URL to request.
 * @param {{ timeoutMs?: number, cookiesPath?: string, userAgent?: string, encoding?: string }} options
 *   `encoding` is passed to `TextDecoder`.
 * @returns {Promise<{ finalUrl: string, fromCache: false, html: string }>}
 *   `finalUrl` is the response URL after redirects.
 * @throws {Error} "Request timed out: …" when aborted by the timer,
 *   "Request failed: …" for other transport errors (original error as
 *   `cause`), and "Request failed with status N…" for non-2xx responses.
 */
async function fetchWithHttp(url, options) {
  // Read cookies before arming the timer: parseCookiesFile can throw, and a
  // throw at that point must not leave a pending timeout behind.
  const { header: cookiesHeader } = parseCookiesFile(options.cookiesPath);
  const headers = new Headers({ "User-Agent": options.userAgent ?? DEFAULT_USER_AGENT });
  if (cookiesHeader) headers.set("Cookie", cookiesHeader);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), options.timeoutMs ?? DEFAULT_TIMEOUT_MS);
  let response;
  let buffer;
  try {
    response = await fetch(url, {
      headers,
      method: "GET",
      redirect: "follow",
      signal: controller.signal
    });
    // The body download stays inside the timed section.
    if (response.ok) buffer = await response.arrayBuffer();
  } catch (error) {
    const prefix = error instanceof Error && error.name === "AbortError" ? "Request timed out" : "Request failed";
    throw new Error(`${prefix}: ${String(error)}`, { cause: error });
  } finally {
    clearTimeout(timeout);
  }

  // Raised outside the try so the status message is not wrapped a second
  // time as "Request failed: Error: Request failed with status …".
  if (!response.ok) throw new Error(`Request failed with status ${response.status}. If blocked, try --user-agent.`);

  return {
    finalUrl: response.url,
    fromCache: false,
    html: new TextDecoder(options.encoding).decode(buffer)
  };
}
|
|
254
|
+
/**
 * Fetches `url` by rendering it in headless Chromium through Playwright,
 * which is imported on demand.
 *
 * @param {string} url - URL to open.
 * @param {{ timeoutMs?: number, cookiesPath?: string, userAgent?: string }} options
 * @returns {Promise<{ finalUrl: string, fromCache: false, html: string }>}
 *   `html` is the page content after navigation reaches "networkidle";
 *   `finalUrl` is the page URL at that point.
 * @throws {Error} When `playwright` cannot be imported, or when cookie
 *   parsing, browser launch or navigation fails.
 */
async function fetchWithBrowser(url, options) {
  let playwright = null;
  try {
    playwright = await import("playwright");
  } catch (error) {
    throw new Error(`JS mode requested but playwright is not installed. Install it and retry. (${String(error)})`, { cause: error });
  }

  const { playwrightCookies } = parseCookiesFile(options.cookiesPath);
  const browser = await playwright.chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({ userAgent: options.userAgent ?? DEFAULT_USER_AGENT });
    if (playwrightCookies.length) await context.addCookies(playwrightCookies.map((cookie) => ({
      ...cookie,
      httpOnly: false,
      sameSite: "Lax"
    })));
    const page = await context.newPage();
    await page.goto(url, {
      timeout: options.timeoutMs ?? DEFAULT_TIMEOUT_MS,
      waitUntil: "networkidle"
    });
    const html = await page.content();
    const finalUrl = page.url();
    return {
      finalUrl,
      fromCache: false,
      html
    };
  } finally {
    // Close the browser on failure too, so a navigation error or timeout
    // does not leave a Chromium process running.
    await browser.close();
  }
}
|
|
283
|
+
/**
 * Returns the HTML for `url`, serving it from the cache when allowed and
 * otherwise fetching it (headless browser when `options.useJs` is set,
 * plain HTTP if not) and storing the result in the cache.
 *
 * @param {string} url - URL to load.
 * @param {object} options - Reads `noCache`, `cache`, `useJs` and `verbose`;
 *   the whole object is forwarded to the selected fetcher.
 * @returns {Promise<{ finalUrl: string, fromCache: boolean, html: string }>}
 *   On a cache hit `finalUrl` is the requested `url`.
 */
async function fetchPage(url, options) {
  const cacheEnabled = !options.noCache;
  const cacheOptions = { enabled: cacheEnabled, ...options.cache };

  const cached = cacheEnabled ? await readFromCache(url, cacheOptions) : null;
  if (cached) {
    logVerbose("Cache hit", options.verbose);
    return { finalUrl: url, fromCache: true, html: cached.content };
  }

  const mode = options.useJs ? "(headless browser)" : "(http)";
  logVerbose(`Fetching ${url} ${mode}`, options.verbose);

  const fetcher = options.useJs ? fetchWithBrowser : fetchWithHttp;
  const result = await fetcher(url, options);
  if (cacheEnabled) {
    await writeToCache(url, result.html, cacheOptions);
  }
  return result;
}
|
|
307
|
+
|
|
308
|
+
//#endregion
|
|
309
|
+
//#region src/images.ts
|
|
310
|
+
/**
 * Makes every `<img>` source absolute and records a caption on the element
 * as `data-into-md-caption`. The caption is the text of a `<figcaption>`
 * inside the closest enclosing `<figure>`, or else the image's `title`
 * attribute; images with neither get no caption attribute.
 *
 * @param {string} html - Markup to annotate.
 * @param {string} baseUrl - Base for resolving relative image URLs.
 * @returns {string} Annotated body HTML.
 */
function annotateImages(html, baseUrl) {
  const $ = load(html);
  $("img").each((_, node) => {
    const $image = $(node);

    const resolvedSrc = toAbsoluteUrl($image.attr("src"), baseUrl);
    if (resolvedSrc) {
      $image.attr("src", resolvedSrc);
    }

    const figureCaption = $image.closest("figure").find("figcaption").text().trim();
    const caption = figureCaption || $image.attr("title")?.trim() || undefined;
    if (caption) {
      $image.attr("data-into-md-caption", caption);
    }
  });
  return getBodyHtml($);
}
|
|
321
|
+
|
|
322
|
+
//#endregion
|
|
323
|
+
//#region src/metadata.ts
|
|
324
|
+
/**
 * Renders metadata as a `---`-delimited frontmatter block. `title`,
 * `description`, `author` and `date` are written in that order when truthy;
 * `source` is always written last. Every value is double-quoted after
 * passing through `escapeFrontmatter`.
 *
 * @param {{ title?: string, description?: string, author?: string, date?: string, source: string }} meta
 * @returns {string} Frontmatter text without a trailing newline.
 */
function buildFrontmatter(meta) {
  const optionalKeys = ["title", "description", "author", "date"];
  const optionalLines = optionalKeys
    .filter((key) => meta[key])
    .map((key) => `${key}: "${escapeFrontmatter(meta[key])}"`);
  return [
    "---",
    ...optionalLines,
    `source: "${escapeFrontmatter(meta.source)}"`,
    "---"
  ].join("\n");
}
|
|
334
|
+
/**
 * Escapes a string for use inside a YAML double-quoted scalar.
 *
 * Backslashes are escaped first so the escapes added afterwards are not
 * doubled; then double quotes, and finally carriage returns, newlines and
 * tabs, so a multi-line value stays on a single frontmatter line.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text safe to place between double quotes.
 */
function escapeFrontmatter(value) {
  return value
    .replaceAll("\\", "\\\\")
    .replaceAll("\"", "\\\"")
    .replaceAll("\r", "\\r")
    .replaceAll("\n", "\\n")
    .replaceAll("\t", "\\t");
}
|
|
337
|
+
|
|
338
|
+
//#endregion
|
|
339
|
+
//#region src/tables.ts
|
|
340
|
+
/**
 * Derives column headers for a table.
 *
 * Uses the `thead th` cells when present, otherwise the cells of the first
 * row. An empty header cell becomes `Column N` (1-based) instead of being
 * dropped, so the header at index i always describes the cell at index i.
 *
 * @param $table - Cheerio selection for the table.
 * @param $ - Cheerio instance used to wrap individual cells.
 * @returns {string[]} One header per cell; empty when the table has no cells.
 */
function extractHeaders($table, $) {
  const toLabel = (cell, index) => $(cell).text().trim() || `Column ${index + 1}`;

  const explicitHeaders = $table.find("thead th");
  if (explicitHeaders.length) return explicitHeaders.toArray().map(toLabel);

  const firstRowCells = $table.find("tr").first().find("th, td");
  return firstRowCells.toArray().map(toLabel);
}
|
|
347
|
+
/**
 * Converts a table's data rows into objects keyed by header.
 *
 * Data rows are the `tbody tr` rows when any exist, otherwise every row
 * after the first. When the table has no `thead th` cells, the first `tr`
 * is the header row and is skipped even if it sits inside a `tbody`;
 * otherwise it would be emitted a second time as data.
 *
 * @param $table - Cheerio selection for the table.
 * @param {string[]} headers - Column names; cells beyond this list are
 *   keyed `Column N` (1-based).
 * @param $ - Cheerio instance used to wrap rows and cells.
 * @returns {Array<Record<string, string>>} One record per non-empty row.
 */
function extractRows($table, headers, $) {
  const allRows = $table.find("tr").toArray();
  const bodyRows = $table.find("tbody tr").toArray();
  // null means the headers came from <thead>, so no row needs skipping.
  const headerRow = $table.find("thead th").length > 0 ? null : allRows[0];
  const candidates = bodyRows.length > 0 ? bodyRows : allRows.slice(1);

  const rows = [];
  for (const row of candidates) {
    if (row === headerRow) continue;
    const cells = $(row).find("td, th");
    if (!cells.length) continue;
    const record = {};
    for (const [cellIndex, cell] of cells.toArray().entries()) {
      const key = headers[cellIndex] ?? `Column ${cellIndex + 1}`;
      record[key] = $(cell).text().trim();
    }
    rows.push(record);
  }
  return rows;
}
|
|
362
|
+
/**
 * Replaces every `<table>` in `html` with a
 * `<pre data-into-md-table="true">` element whose text is pretty-printed
 * JSON holding the table's caption (when non-empty), headers and rows.
 *
 * @param {string} html - Markup that may contain tables.
 * @returns {string} Body HTML with tables replaced.
 */
function convertTablesToJson(html) {
  const $ = load(html);
  $("table").toArray().forEach((tableNode) => {
    const $table = $(tableNode);
    const captionText = $table.find("caption").first().text().trim();
    const headers = extractHeaders($table, $);
    const summary = {
      // An undefined caption is omitted by JSON.stringify.
      caption: captionText || undefined,
      headers,
      rows: extractRows($table, headers, $)
    };
    const $pre = $("<pre>");
    $pre.attr("data-into-md-table", "true");
    $pre.text(JSON.stringify(summary, null, 2));
    $table.replaceWith($pre);
  });
  return getBodyHtml($);
}
|
|
378
|
+
|
|
379
|
+
//#endregion
|
|
380
|
+
//#region src/index.ts
|
|
381
|
+
// Default request timeout in milliseconds.
const DEFAULT_TIMEOUT = 30000;

/**
 * Runs the whole pipeline for one URL: fetch, extract content, convert
 * tables to JSON blocks, annotate images, convert to Markdown, prepend
 * frontmatter, then write to `options.output` or print to stdout.
 * A warning goes to stderr when the output exceeds 100000 bytes.
 *
 * @param {string} url - Page to convert.
 * @param {object} options - Reads `exclude`, `verbose`, `cookies`,
 *   `encoding`, `noCache`, `timeout`, `js`, `userAgent`, `raw`,
 *   `stripLinks` and `output`.
 * @returns {Promise<void>}
 */
async function run(url, options) {
  const { verbose } = options;
  const excludeSelectors = (options.exclude?.split(",") ?? [])
    .map((selector) => selector.trim())
    .filter(Boolean);

  if (verbose) {
    console.error("Starting into-md…");
  }

  const page = await fetchPage(url, {
    cookiesPath: options.cookies,
    encoding: options.encoding,
    noCache: options.noCache,
    timeoutMs: options.timeout ?? DEFAULT_TIMEOUT,
    useJs: options.js,
    userAgent: options.userAgent,
    verbose
  });
  const { finalUrl } = page;

  const extracted = extractContent(page.html, {
    baseUrl: finalUrl,
    excludeSelectors,
    raw: options.raw
  });

  const withTables = convertTablesToJson(extracted.html);
  const withImages = annotateImages(withTables, finalUrl);
  const markdown = convertHtmlToMarkdown(withImages, {
    baseUrl: finalUrl,
    stripLinks: options.stripLinks
  });

  const frontmatter = buildFrontmatter({ ...extracted.metadata, source: finalUrl });
  const output = `${frontmatter}\n\n${markdown}`.trim();

  if (options.output) {
    await writeFile(options.output, output, "utf8");
    if (verbose) {
      console.error(`Saved to ${options.output}`);
    }
  } else {
    console.log(output);
  }

  const size = Buffer.byteLength(output, "utf8");
  if (size > 1e5) {
    console.error(`Warning: Output is ${Math.round(size / 1024)}KB. Large documents may exceed LLM context limits.`);
  }
}
|
|
417
|
+
/**
 * Builds the commander program for the `into-md` CLI: one required `<url>`
 * argument, the conversion/fetch options, and the version string.
 *
 * @returns The configured `Command` instance.
 */
function buildProgram() {
  return new Command()
    .name("into-md")
    .description("Fetch a web page and convert its content to markdown.")
    .argument("<url>", "URL to fetch")
    .option("-o, --output <file>", "Write output to file instead of stdout")
    .option("--js", "Use headless browser (Playwright) for JS-rendered content")
    .option("--raw", "Skip content extraction, convert entire HTML")
    .option("--cookies <file>", "Path to cookies file for authenticated requests")
    .option("--user-agent <string>", "Custom User-Agent header")
    .option("--encoding <encoding>", "Force character encoding (auto-detected by default)")
    .option("--strip-links", "Remove hyperlinks, keep only anchor text")
    .option("--exclude <selectors>", "CSS selectors to exclude (comma-separated)")
    .option("--timeout <ms>", "Request timeout in milliseconds", `${DEFAULT_TIMEOUT}`)
    .option("--no-cache", "Bypass response cache")
    .option("-v, --verbose", "Show detailed progress information")
    .version("0.1.0");
}
|
|
422
|
+
/**
 * CLI entry point: parses `process.argv`, shows help when no URL was given,
 * and runs the conversion. Failures are printed to stderr and reported
 * through `process.exitCode = 1`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const program = buildProgram();
  program.parse(process.argv);
  const [url] = program.args;
  if (!url) {
    program.help();
    return;
  }
  const opts = program.opts();
  try {
    const timeout = opts.timeout ? Number(opts.timeout) : DEFAULT_TIMEOUT;
    // A non-numeric value would become NaN and reach the timers unvalidated.
    if (!Number.isFinite(timeout) || timeout <= 0) {
      throw new Error(`Invalid --timeout value "${opts.timeout}": expected a positive number of milliseconds.`);
    }
    await run(url, {
      ...opts,
      // commander stores a negatable `--no-cache` flag as `cache: false`;
      // it never sets `noCache`, which is the key `run` reads.
      noCache: opts.noCache ?? (opts.cache === false),
      timeout
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exitCode = 1;
  }
}
main();
|
|
443
|
+
|
|
444
|
+
//#endregion
|
|
445
|
+
export { };
|
|
446
|
+
//# sourceMappingURL=index.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.mjs","names":["metadata"],"sources":["../src/utils.ts","../src/converter.ts","../src/extractor.ts","../src/cache.ts","../src/fetcher.ts","../src/images.ts","../src/metadata.ts","../src/tables.ts","../src/index.ts"],"sourcesContent":["import type { CheerioAPI } from \"cheerio\";\n\n/**\n * Converts a relative URL to an absolute URL using the provided base URL.\n * Returns the original URL if it cannot be parsed.\n */\nexport const toAbsoluteUrl = (\n url: string | undefined,\n baseUrl: string\n): string | undefined => {\n if (!url) {\n return undefined;\n }\n try {\n return new URL(url, baseUrl).toString();\n } catch {\n return url;\n }\n};\n\n/**\n * Extracts the inner HTML from the body element, or falls back to root HTML.\n * Common pattern used across multiple cheerio-based transformations.\n */\nexport const getBodyHtml = ($: CheerioAPI): string => {\n const body = $(\"body\");\n return body.length ? (body.html() ?? \"\") : ($.root().html() ?? \"\");\n};\n","import { load } from \"cheerio\";\nimport TurndownService from \"turndown\";\n\nimport { getBodyHtml, toAbsoluteUrl } from \"./utils\";\n\nexport interface ConvertOptions {\n baseUrl: string;\n stripLinks?: boolean;\n}\n\nfunction prepareDom(html: string, baseUrl: string): string {\n const $ = load(html);\n\n for (const el of $(\"a[href]\").toArray()) {\n const $el = $(el);\n const absolute = toAbsoluteUrl($el.attr(\"href\"), baseUrl);\n if (absolute) {\n $el.attr(\"href\", absolute);\n }\n }\n\n for (const el of $(\"img[src]\").toArray()) {\n const $el = $(el);\n const absolute = toAbsoluteUrl($el.attr(\"src\"), baseUrl);\n if (absolute) {\n $el.attr(\"src\", absolute);\n }\n }\n\n $(\"script, style\").remove();\n return getBodyHtml($);\n}\n\nexport function convertHtmlToMarkdown(\n html: string,\n options: ConvertOptions\n): string {\n const prepared = prepareDom(html, options.baseUrl);\n const turndown = new TurndownService({\n bulletListMarker: \"-\",\n codeBlockStyle: 
\"fenced\",\n headingStyle: \"atx\",\n });\n\n turndown.addRule(\"stripLinks\", {\n filter: \"a\",\n replacement: (content, node) => {\n if (options.stripLinks) {\n return content;\n }\n const href = (node as HTMLElement).getAttribute(\"href\");\n if (!href) {\n return content;\n }\n return `[${content}](${href})`;\n },\n });\n\n turndown.addRule(\"imagesWithCaption\", {\n filter: \"img\",\n replacement: (_, node) => {\n const element = node as HTMLElement;\n const src = element.getAttribute(\"src\") ?? \"\";\n const alt = element.getAttribute(\"alt\") ?? \"\";\n const caption = element.getAttribute(\"data-into-md-caption\");\n const imageLine = ``;\n if (caption) {\n return `${imageLine}\\n*${caption}*`;\n }\n return imageLine;\n },\n });\n\n turndown.addRule(\"tableJson\", {\n filter: (node) =>\n node.nodeName === \"PRE\" &&\n (node as HTMLElement).getAttribute(\"data-into-md-table\") === \"true\",\n replacement: (_content, node) => {\n const text = (node as HTMLElement).textContent?.trim() ?? \"\";\n return `\\`\\`\\`json\\n${text}\\n\\`\\`\\``;\n },\n });\n\n turndown.addRule(\"embeds\", {\n filter: [\"iframe\", \"embed\", \"video\"],\n replacement: (_, node) => {\n const src = (node as HTMLElement).getAttribute(\"src\") ?? 
\"\";\n if (!src) {\n return \"\";\n }\n return `[Embedded content: ${src}]`;\n },\n });\n\n return turndown.turndown(prepared);\n}\n","import { Readability } from \"@mozilla/readability\";\nimport { JSDOM } from \"jsdom\";\n\nexport interface ExtractOptions {\n raw?: boolean;\n excludeSelectors?: string[];\n baseUrl: string;\n}\n\nexport interface ExtractedContent {\n html: string;\n metadata: {\n title?: string;\n description?: string;\n author?: string;\n source: string;\n };\n}\n\nfunction removeNodes(document: Document, selectors: string[]) {\n for (const selector of selectors) {\n for (const node of Array.from(document.querySelectorAll(selector))) {\n node.remove();\n }\n }\n}\n\nfunction extractMetadata(document: Document, source: string) {\n const title =\n document.querySelector(\"title\")?.textContent ??\n document\n .querySelector('meta[property=\"og:title\"]')\n ?.getAttribute(\"content\") ??\n undefined;\n\n const description =\n document\n .querySelector('meta[name=\"description\"]')\n ?.getAttribute(\"content\") ??\n document\n .querySelector('meta[property=\"og:description\"]')\n ?.getAttribute(\"content\") ??\n undefined;\n\n const author =\n document.querySelector('meta[name=\"author\"]')?.getAttribute(\"content\") ??\n document\n .querySelector('meta[property=\"article:author\"]')\n ?.getAttribute(\"content\") ??\n undefined;\n\n return { author, description, source, title: title ?? 
undefined };\n}\n\nexport function extractContent(\n html: string,\n { raw = false, excludeSelectors = [], baseUrl }: ExtractOptions\n): ExtractedContent {\n const dom = new JSDOM(html, { url: baseUrl });\n const { document } = dom.window;\n\n if (excludeSelectors.length) {\n removeNodes(document, excludeSelectors);\n }\n\n if (raw) {\n const metadata = extractMetadata(document, baseUrl);\n return { html: document.documentElement.outerHTML, metadata };\n }\n\n const clone = document.cloneNode(true) as Document;\n const reader = new Readability(clone);\n const article = reader.parse();\n\n const contentHtml =\n article?.content ?? document.querySelector(\"body\")?.innerHTML ?? \"\";\n const metadata = extractMetadata(document, baseUrl);\n if (article?.title && !metadata.title) {\n metadata.title = article.title;\n }\n if (article?.byline && !metadata.author) {\n metadata.author = article.byline;\n }\n return { html: contentHtml, metadata };\n}\n","import { createHash } from \"node:crypto\";\nimport { mkdir, readFile, stat, writeFile } from \"node:fs/promises\";\nimport { dirname, join } from \"node:path\";\n\nexport interface CacheOptions {\n enabled: boolean;\n ttlMs: number;\n cacheDir?: string;\n}\n\nexport interface CachedResponse {\n url: string;\n fetchedAt: number;\n content: string;\n}\n\nconst defaultCacheDir = join(\n process.env.HOME ?? process.cwd(),\n \".cache\",\n \"into-md\"\n);\n\nconst DEFAULT_TTL_MS = 60 * 60 * 1000;\n\nconst buildCachePath = (url: string, cacheDir = defaultCacheDir): string => {\n const hash = createHash(\"sha256\").update(url).digest(\"hex\");\n return join(cacheDir, `${hash}.json`);\n};\n\nexport async function readFromCache(\n url: string,\n options?: Partial<CacheOptions>\n): Promise<CachedResponse | null> {\n const {\n enabled = true,\n ttlMs = DEFAULT_TTL_MS,\n cacheDir = defaultCacheDir,\n } = options ?? 
{};\n\n if (!enabled) {\n return null;\n }\n\n const target = buildCachePath(url, cacheDir);\n try {\n const [file, info] = await Promise.all([\n readFile(target, \"utf8\"),\n stat(target),\n ]);\n const payload = JSON.parse(file) as CachedResponse;\n const isFresh = info.mtimeMs + ttlMs > Date.now();\n if (!isFresh) {\n return null;\n }\n if (payload.url !== url) {\n return null;\n }\n return payload;\n } catch {\n return null;\n }\n}\n\nexport async function writeToCache(\n url: string,\n content: string,\n options?: Partial<CacheOptions>\n): Promise<void> {\n const { enabled = true, cacheDir = defaultCacheDir } = options ?? {};\n\n if (!enabled) {\n return;\n }\n\n const target = buildCachePath(url, cacheDir);\n await mkdir(dirname(target), { recursive: true });\n const payload: CachedResponse = { content, fetchedAt: Date.now(), url };\n await writeFile(target, JSON.stringify(payload, null, 2), \"utf8\");\n}\n","import { readFileSync } from \"node:fs\";\nimport { basename } from \"node:path\";\nimport { type CacheOptions, readFromCache, writeToCache } from \"./cache\";\n\nexport interface FetchOptions {\n useJs?: boolean;\n cookiesPath?: string;\n userAgent?: string;\n encoding?: string;\n timeoutMs?: number;\n cache?: Partial<CacheOptions>;\n noCache?: boolean;\n verbose?: boolean;\n}\n\nexport interface FetchResult {\n html: string;\n finalUrl: string;\n fromCache: boolean;\n}\n\ninterface CookieRecord {\n name: string;\n value: string;\n domain: string;\n path: string;\n secure: boolean;\n expires: number;\n}\n\nconst DEFAULT_USER_AGENT =\n \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0 Safari/537.36\";\n\nconst DEFAULT_TIMEOUT_MS = 30_000;\n\nconst logVerbose = (message: string, verbose?: boolean): void => {\n if (verbose) {\n console.error(message);\n }\n};\n\nfunction parseNetscapeCookieLine(\n line: string\n): { record: CookieRecord; headerPair: string } | null {\n const trimmed = line.trim();\n if 
(!trimmed || trimmed.startsWith(\"#\")) {\n return null;\n }\n\n const parts = trimmed.split(\"\\t\");\n if (parts.length < 7) {\n return null;\n }\n\n const domain = parts[0];\n const path = parts[2];\n const secureFlag = parts[3];\n const expires = parts[4];\n const name = parts[5];\n const value = parts[6];\n if (!(domain && path && secureFlag && expires && name && value)) {\n return null;\n }\n\n return {\n headerPair: `${name}=${value}`,\n record: {\n domain,\n expires: Number(expires),\n name,\n path,\n secure: secureFlag.toLowerCase() === \"true\",\n value,\n },\n };\n}\n\nfunction parseCookiesFile(cookiesPath?: string): {\n header: string | undefined;\n playwrightCookies: CookieRecord[];\n} {\n if (!cookiesPath) {\n return { header: undefined, playwrightCookies: [] };\n }\n let content: string;\n try {\n content = readFileSync(cookiesPath, \"utf8\");\n } catch (error) {\n throw new Error(\n `Unable to read cookies file \"${basename(cookiesPath)}\": ${String(error)}`,\n { cause: error }\n );\n }\n\n const entries: CookieRecord[] = [];\n const headerPairs: string[] = [];\n for (const line of content.split(\"\\n\")) {\n const parsed = parseNetscapeCookieLine(line);\n if (!parsed) {\n continue;\n }\n entries.push(parsed.record);\n headerPairs.push(parsed.headerPair);\n }\n\n return {\n header: headerPairs.length ? headerPairs.join(\"; \") : undefined,\n playwrightCookies: entries,\n };\n}\n\nasync function fetchWithHttp(\n url: string,\n options: FetchOptions\n): Promise<FetchResult> {\n const controller = new AbortController();\n const timeout = setTimeout(\n () => controller.abort(),\n options.timeoutMs ?? DEFAULT_TIMEOUT_MS\n );\n\n const { header: cookiesHeader } = parseCookiesFile(options.cookiesPath);\n const headers = new Headers({\n \"User-Agent\": options.userAgent ?? 
DEFAULT_USER_AGENT,\n });\n if (cookiesHeader) {\n headers.set(\"Cookie\", cookiesHeader);\n }\n\n try {\n const response = await fetch(url, {\n headers,\n method: \"GET\",\n redirect: \"follow\",\n signal: controller.signal,\n });\n if (!response.ok) {\n throw new Error(\n `Request failed with status ${response.status}. If blocked, try --user-agent.`\n );\n }\n\n const finalUrl = response.url;\n const buffer = await response.arrayBuffer();\n const decoder = new TextDecoder(options.encoding);\n const html = decoder.decode(buffer);\n return { finalUrl, fromCache: false, html };\n } catch (error) {\n const prefix =\n error instanceof Error && error.name === \"AbortError\"\n ? \"Request timed out\"\n : \"Request failed\";\n throw new Error(`${prefix}: ${String(error)}`, { cause: error });\n } finally {\n clearTimeout(timeout);\n }\n}\n\nasync function fetchWithBrowser(\n url: string,\n options: FetchOptions\n): Promise<FetchResult> {\n let playwright: typeof import(\"playwright\") | null = null;\n try {\n playwright = await import(\"playwright\");\n } catch (error) {\n throw new Error(\n `JS mode requested but playwright is not installed. Install it and retry. (${String(\n error\n )})`,\n { cause: error }\n );\n }\n\n const { playwrightCookies } = parseCookiesFile(options.cookiesPath);\n const browser = await playwright.chromium.launch({ headless: true });\n const context = await browser.newContext({\n userAgent: options.userAgent ?? DEFAULT_USER_AGENT,\n });\n\n if (playwrightCookies.length) {\n await context.addCookies(\n playwrightCookies.map((cookie) => ({\n ...cookie,\n httpOnly: false,\n sameSite: \"Lax\" as const,\n }))\n );\n }\n\n const page = await context.newPage();\n await page.goto(url, {\n timeout: options.timeoutMs ?? 
DEFAULT_TIMEOUT_MS,\n waitUntil: \"networkidle\",\n });\n\n const html = await page.content();\n const finalUrl = page.url();\n\n await browser.close();\n return { finalUrl, fromCache: false, html };\n}\n\nexport async function fetchPage(\n url: string,\n options: FetchOptions\n): Promise<FetchResult> {\n const cacheEnabled = !options.noCache;\n if (cacheEnabled) {\n const cached = await readFromCache(url, {\n enabled: cacheEnabled,\n ...options.cache,\n });\n if (cached) {\n logVerbose(\"Cache hit\", options.verbose);\n return { finalUrl: url, fromCache: true, html: cached.content };\n }\n }\n\n logVerbose(\n `Fetching ${url} ${options.useJs ? \"(headless browser)\" : \"(http)\"}`,\n options.verbose\n );\n const result = options.useJs\n ? await fetchWithBrowser(url, options)\n : await fetchWithHttp(url, options);\n\n if (cacheEnabled) {\n await writeToCache(url, result.html, {\n enabled: cacheEnabled,\n ...options.cache,\n });\n }\n\n return result;\n}\n","import { load } from \"cheerio\";\n\nimport { getBodyHtml, toAbsoluteUrl } from \"./utils\";\n\nexport function annotateImages(html: string, baseUrl: string): string {\n const $ = load(html);\n\n for (const img of $(\"img\").toArray()) {\n const $img = $(img);\n const src = $img.attr(\"src\");\n const absoluteSrc = toAbsoluteUrl(src, baseUrl);\n if (absoluteSrc) {\n $img.attr(\"src\", absoluteSrc);\n }\n\n const figure = $img.closest(\"figure\");\n const caption =\n figure.find(\"figcaption\").text().trim() ||\n $img.attr(\"title\")?.trim() ||\n undefined;\n if (caption) {\n $img.attr(\"data-into-md-caption\", caption);\n }\n }\n\n return getBodyHtml($);\n}\n","export interface FrontmatterInput {\n title?: string;\n description?: string;\n author?: string;\n date?: string;\n source: string;\n}\n\nexport function buildFrontmatter(meta: FrontmatterInput): string {\n const lines = [\"---\"];\n if (meta.title) {\n lines.push(`title: \"${escapeFrontmatter(meta.title)}\"`);\n }\n if (meta.description) {\n 
lines.push(`description: \"${escapeFrontmatter(meta.description)}\"`);\n }\n if (meta.author) {\n lines.push(`author: \"${escapeFrontmatter(meta.author)}\"`);\n }\n if (meta.date) {\n lines.push(`date: \"${escapeFrontmatter(meta.date)}\"`);\n }\n lines.push(`source: \"${escapeFrontmatter(meta.source)}\"`);\n lines.push(\"---\");\n return lines.join(\"\\n\");\n}\n\nfunction escapeFrontmatter(value: string): string {\n return value.replaceAll('\"', String.raw`\\\"`);\n}\n","import { type Cheerio, type CheerioAPI, load } from \"cheerio\";\nimport type { AnyNode } from \"domhandler\";\n\nimport { getBodyHtml } from \"./utils\";\n\ninterface TableJson {\n caption?: string;\n headers: string[];\n rows: Record<string, string>[];\n}\n\nfunction extractHeaders($table: Cheerio<AnyNode>, $: CheerioAPI): string[] {\n const explicitHeaders = $table.find(\"thead th\");\n if (explicitHeaders.length) {\n return explicitHeaders\n .toArray()\n .map((th) => $(th).text().trim())\n .filter(Boolean);\n }\n\n const firstRowHeaders = $table.find(\"tr\").first().find(\"th, td\");\n if (firstRowHeaders.length) {\n return firstRowHeaders\n .toArray()\n .map((cell, index) => $(cell).text().trim() || `Column ${index + 1}`);\n }\n\n return [];\n}\n\nfunction extractRows(\n $table: Cheerio<AnyNode>,\n headers: string[],\n $: CheerioAPI\n): Record<string, string>[] {\n const rows: Record<string, string>[] = [];\n const dataRows =\n $table.find(\"tbody tr\").length > 0\n ? $table.find(\"tbody tr\")\n : $table.find(\"tr\").slice(1);\n\n for (const row of dataRows.toArray()) {\n const cells = $(row).find(\"td, th\");\n if (!cells.length) {\n continue;\n }\n const record: Record<string, string> = {};\n for (const [cellIndex, cell] of cells.toArray().entries()) {\n const key = headers[cellIndex] ?? 
`Column ${cellIndex + 1}`;\n record[key] = $(cell).text().trim();\n }\n rows.push(record);\n }\n\n return rows;\n}\n\nexport function convertTablesToJson(html: string): string {\n const $ = load(html);\n\n for (const table of $(\"table\").toArray()) {\n const $table = $(table);\n const caption = $table.find(\"caption\").first().text().trim() || undefined;\n const headers = extractHeaders($table, $);\n const rows = extractRows($table, headers, $);\n\n const json: TableJson = {\n caption,\n headers,\n rows,\n };\n\n const pre = $(\"<pre>\")\n .attr(\"data-into-md-table\", \"true\")\n .text(JSON.stringify(json, null, 2));\n $table.replaceWith(pre);\n }\n\n return getBodyHtml($);\n}\n","import { writeFile } from \"node:fs/promises\";\nimport { Command } from \"commander\";\n\nimport { convertHtmlToMarkdown } from \"./converter\";\nimport { extractContent } from \"./extractor\";\nimport { fetchPage } from \"./fetcher\";\nimport { annotateImages } from \"./images\";\nimport { buildFrontmatter } from \"./metadata\";\nimport { convertTablesToJson } from \"./tables\";\n\nconst DEFAULT_TIMEOUT = 30_000;\n\ninterface CliOptions {\n output?: string;\n js?: boolean;\n raw?: boolean;\n cookies?: string;\n userAgent?: string;\n encoding?: string;\n stripLinks?: boolean;\n exclude?: string;\n timeout?: number;\n noCache?: boolean;\n verbose?: boolean;\n}\n\nasync function run(url: string, options: CliOptions) {\n const selectors =\n options.exclude\n ?.split(\",\")\n .map((selector) => selector.trim())\n .filter(Boolean) ?? [];\n\n if (options.verbose) {\n console.error(\"Starting into-md…\");\n }\n\n const fetchResult = await fetchPage(url, {\n cookiesPath: options.cookies,\n encoding: options.encoding,\n noCache: options.noCache,\n timeoutMs: options.timeout ?? 
DEFAULT_TIMEOUT,\n useJs: options.js,\n userAgent: options.userAgent,\n verbose: options.verbose,\n });\n\n const extracted = extractContent(fetchResult.html, {\n baseUrl: fetchResult.finalUrl,\n excludeSelectors: selectors,\n raw: options.raw,\n });\n\n let workingHtml = extracted.html;\n workingHtml = convertTablesToJson(workingHtml);\n workingHtml = annotateImages(workingHtml, fetchResult.finalUrl);\n\n const markdown = convertHtmlToMarkdown(workingHtml, {\n baseUrl: fetchResult.finalUrl,\n stripLinks: options.stripLinks,\n });\n\n const frontmatter = buildFrontmatter({\n ...extracted.metadata,\n source: fetchResult.finalUrl,\n });\n\n const output = `${frontmatter}\\n\\n${markdown}`.trim();\n\n if (options.output) {\n await writeFile(options.output, output, \"utf8\");\n if (options.verbose) {\n console.error(`Saved to ${options.output}`);\n }\n } else {\n console.log(output);\n }\n\n const size = Buffer.byteLength(output, \"utf8\");\n if (size > 100_000) {\n console.error(\n `Warning: Output is ${Math.round(size / 1024)}KB. 
Large documents may exceed LLM context limits.`\n );\n }\n}\n\nfunction buildProgram() {\n const program = new Command()\n .name(\"into-md\")\n .description(\"Fetch a web page and convert its content to markdown.\")\n .argument(\"<url>\", \"URL to fetch\")\n .option(\"-o, --output <file>\", \"Write output to file instead of stdout\")\n .option(\"--js\", \"Use headless browser (Playwright) for JS-rendered content\")\n .option(\"--raw\", \"Skip content extraction, convert entire HTML\")\n .option(\n \"--cookies <file>\",\n \"Path to cookies file for authenticated requests\"\n )\n .option(\"--user-agent <string>\", \"Custom User-Agent header\")\n .option(\n \"--encoding <encoding>\",\n \"Force character encoding (auto-detected by default)\"\n )\n .option(\"--strip-links\", \"Remove hyperlinks, keep only anchor text\")\n .option(\n \"--exclude <selectors>\",\n \"CSS selectors to exclude (comma-separated)\"\n )\n .option(\n \"--timeout <ms>\",\n \"Request timeout in milliseconds\",\n `${DEFAULT_TIMEOUT}`\n )\n .option(\"--no-cache\", \"Bypass response cache\")\n .option(\"-v, --verbose\", \"Show detailed progress information\");\n\n program.version(\"0.1.0\");\n return program;\n}\n\nasync function main() {\n const program = buildProgram();\n program.parse(process.argv);\n const [url] = program.args;\n if (!url) {\n program.help();\n return;\n }\n\n const opts = program.opts<CliOptions>();\n try {\n await run(url, {\n ...opts,\n timeout: opts.timeout ? Number(opts.timeout) : DEFAULT_TIMEOUT,\n });\n } catch (error) {\n const message = error instanceof Error ? 
error.message : String(error);\n console.error(message);\n process.exitCode = 1;\n }\n}\n\nmain();\n"],"mappings":";;;;;;;;;;;;;;;;;AAMA,MAAa,iBACX,KACA,YACuB;AACvB,KAAI,CAAC,IACH;AAEF,KAAI;AACF,SAAO,IAAI,IAAI,KAAK,QAAQ,CAAC,UAAU;SACjC;AACN,SAAO;;;;;;;AAQX,MAAa,eAAe,MAA0B;CACpD,MAAM,OAAO,EAAE,OAAO;AACtB,QAAO,KAAK,SAAU,KAAK,MAAM,IAAI,KAAO,EAAE,MAAM,CAAC,MAAM,IAAI;;;;;AChBjE,SAAS,WAAW,MAAc,SAAyB;CACzD,MAAM,IAAI,KAAK,KAAK;AAEpB,MAAK,MAAM,MAAM,EAAE,UAAU,CAAC,SAAS,EAAE;EACvC,MAAM,MAAM,EAAE,GAAG;EACjB,MAAM,WAAW,cAAc,IAAI,KAAK,OAAO,EAAE,QAAQ;AACzD,MAAI,SACF,KAAI,KAAK,QAAQ,SAAS;;AAI9B,MAAK,MAAM,MAAM,EAAE,WAAW,CAAC,SAAS,EAAE;EACxC,MAAM,MAAM,EAAE,GAAG;EACjB,MAAM,WAAW,cAAc,IAAI,KAAK,MAAM,EAAE,QAAQ;AACxD,MAAI,SACF,KAAI,KAAK,OAAO,SAAS;;AAI7B,GAAE,gBAAgB,CAAC,QAAQ;AAC3B,QAAO,YAAY,EAAE;;AAGvB,SAAgB,sBACd,MACA,SACQ;CACR,MAAM,WAAW,WAAW,MAAM,QAAQ,QAAQ;CAClD,MAAM,WAAW,IAAI,gBAAgB;EACnC,kBAAkB;EAClB,gBAAgB;EAChB,cAAc;EACf,CAAC;AAEF,UAAS,QAAQ,cAAc;EAC7B,QAAQ;EACR,cAAc,SAAS,SAAS;AAC9B,OAAI,QAAQ,WACV,QAAO;GAET,MAAM,OAAQ,KAAqB,aAAa,OAAO;AACvD,OAAI,CAAC,KACH,QAAO;AAET,UAAO,IAAI,QAAQ,IAAI,KAAK;;EAE/B,CAAC;AAEF,UAAS,QAAQ,qBAAqB;EACpC,QAAQ;EACR,cAAc,GAAG,SAAS;GACxB,MAAM,UAAU;GAChB,MAAM,MAAM,QAAQ,aAAa,MAAM,IAAI;GAC3C,MAAM,MAAM,QAAQ,aAAa,MAAM,IAAI;GAC3C,MAAM,UAAU,QAAQ,aAAa,uBAAuB;GAC5D,MAAM,YAAY,KAAK,IAAI,IAAI,IAAI;AACnC,OAAI,QACF,QAAO,GAAG,UAAU,KAAK,QAAQ;AAEnC,UAAO;;EAEV,CAAC;AAEF,UAAS,QAAQ,aAAa;EAC5B,SAAS,SACP,KAAK,aAAa,SACjB,KAAqB,aAAa,qBAAqB,KAAK;EAC/D,cAAc,UAAU,SAAS;AAE/B,UAAO,eADO,KAAqB,aAAa,MAAM,IAAI,GAC/B;;EAE9B,CAAC;AAEF,UAAS,QAAQ,UAAU;EACzB,QAAQ;GAAC;GAAU;GAAS;GAAQ;EACpC,cAAc,GAAG,SAAS;GACxB,MAAM,MAAO,KAAqB,aAAa,MAAM,IAAI;AACzD,OAAI,CAAC,IACH,QAAO;AAET,UAAO,sBAAsB,IAAI;;EAEpC,CAAC;AAEF,QAAO,SAAS,SAAS,SAAS;;;;;AC3EpC,SAAS,YAAY,UAAoB,WAAqB;AAC5D,MAAK,MAAM,YAAY,UACrB,MAAK,MAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,CAChE,MAAK,QAAQ;;AAKnB,SAAS,gBAAgB,UAAoB,QAAgB;CAC3D,MAAM,QACJ,SAAS,cAAc,QAAQ,EAAE,eACjC,SACG,cAAc,8BAA4B,EACzC,aAAa,UAAU,IAC3B;CAEF,MAAM,cACJ,SACG,cAAc,6BAA2B,EACxC,aAAa,UAAU,IA
C3B,SACG,cAAc,oCAAkC,EAC/C,aAAa,UAAU,IAC3B;AASF,QAAO;EAAE,QANP,SAAS,cAAc,wBAAsB,EAAE,aAAa,UAAU,IACtE,SACG,cAAc,oCAAkC,EAC/C,aAAa,UAAU,IAC3B;EAEe;EAAa;EAAQ,OAAO,SAAS;EAAW;;AAGnE,SAAgB,eACd,MACA,EAAE,MAAM,OAAO,mBAAmB,EAAE,EAAE,WACpB;CAElB,MAAM,EAAE,aADI,IAAI,MAAM,MAAM,EAAE,KAAK,SAAS,CAAC,CACpB;AAEzB,KAAI,iBAAiB,OACnB,aAAY,UAAU,iBAAiB;AAGzC,KAAI,KAAK;EACP,MAAMA,aAAW,gBAAgB,UAAU,QAAQ;AACnD,SAAO;GAAE,MAAM,SAAS,gBAAgB;GAAW;GAAU;;CAK/D,MAAM,UADS,IAAI,YADL,SAAS,UAAU,KAAK,CACD,CACd,OAAO;CAE9B,MAAM,cACJ,SAAS,WAAW,SAAS,cAAc,OAAO,EAAE,aAAa;CACnE,MAAM,WAAW,gBAAgB,UAAU,QAAQ;AACnD,KAAI,SAAS,SAAS,CAAC,SAAS,MAC9B,UAAS,QAAQ,QAAQ;AAE3B,KAAI,SAAS,UAAU,CAAC,SAAS,OAC/B,UAAS,SAAS,QAAQ;AAE5B,QAAO;EAAE,MAAM;EAAa;EAAU;;;;;ACnExC,MAAM,kBAAkB,KACtB,QAAQ,IAAI,QAAQ,QAAQ,KAAK,EACjC,UACA,UACD;AAED,MAAM,iBAAiB,OAAU;AAEjC,MAAM,kBAAkB,KAAa,WAAW,oBAA4B;AAE1E,QAAO,KAAK,UAAU,GADT,WAAW,SAAS,CAAC,OAAO,IAAI,CAAC,OAAO,MAAM,CAC7B,OAAO;;AAGvC,eAAsB,cACpB,KACA,SACgC;CAChC,MAAM,EACJ,UAAU,MACV,QAAQ,gBACR,WAAW,oBACT,WAAW,EAAE;AAEjB,KAAI,CAAC,QACH,QAAO;CAGT,MAAM,SAAS,eAAe,KAAK,SAAS;AAC5C,KAAI;EACF,MAAM,CAAC,MAAM,QAAQ,MAAM,QAAQ,IAAI,CACrC,SAAS,QAAQ,OAAO,EACxB,KAAK,OAAO,CACb,CAAC;EACF,MAAM,UAAU,KAAK,MAAM,KAAK;AAEhC,MAAI,EADY,KAAK,UAAU,QAAQ,KAAK,KAAK,EAE/C,QAAO;AAET,MAAI,QAAQ,QAAQ,IAClB,QAAO;AAET,SAAO;SACD;AACN,SAAO;;;AAIX,eAAsB,aACpB,KACA,SACA,SACe;CACf,MAAM,EAAE,UAAU,MAAM,WAAW,oBAAoB,WAAW,EAAE;AAEpE,KAAI,CAAC,QACH;CAGF,MAAM,SAAS,eAAe,KAAK,SAAS;AAC5C,OAAM,MAAM,QAAQ,OAAO,EAAE,EAAE,WAAW,MAAM,CAAC;CACjD,MAAM,UAA0B;EAAE;EAAS,WAAW,KAAK,KAAK;EAAE;EAAK;AACvE,OAAM,UAAU,QAAQ,KAAK,UAAU,SAAS,MAAM,EAAE,EAAE,OAAO;;;;;AC/CnE,MAAM,qBACJ;AAEF,MAAM,qBAAqB;AAE3B,MAAM,cAAc,SAAiB,YAA4B;AAC/D,KAAI,QACF,SAAQ,MAAM,QAAQ;;AAI1B,SAAS,wBACP,MACqD;CACrD,MAAM,UAAU,KAAK,MAAM;AAC3B,KAAI,CAAC,WAAW,QAAQ,WAAW,IAAI,CACrC,QAAO;CAGT,MAAM,QAAQ,QAAQ,MAAM,IAAK;AACjC,KAAI,MAAM,SAAS,EACjB,QAAO;CAGT,MAAM,SAAS,MAAM;CACrB,MAAM,OAAO,MAAM;CACnB,MAAM,aAAa,MAAM;CACzB,MAAM,UAAU,MAAM;CACtB,MAAM,OAAO,MAAM;CACnB,MAAM,QAAQ,MAAM;AACpB,KAAI,EAAE,UAAU,QAAQ,cAAc,WAAW,QAAQ,
OACvD,QAAO;AAGT,QAAO;EACL,YAAY,GAAG,KAAK,GAAG;EACvB,QAAQ;GACN;GACA,SAAS,OAAO,QAAQ;GACxB;GACA;GACA,QAAQ,WAAW,aAAa,KAAK;GACrC;GACD;EACF;;AAGH,SAAS,iBAAiB,aAGxB;AACA,KAAI,CAAC,YACH,QAAO;EAAE,QAAQ;EAAW,mBAAmB,EAAE;EAAE;CAErD,IAAI;AACJ,KAAI;AACF,YAAU,aAAa,aAAa,OAAO;UACpC,OAAO;AACd,QAAM,IAAI,MACR,gCAAgC,SAAS,YAAY,CAAC,KAAK,OAAO,MAAM,IACxE,EAAE,OAAO,OAAO,CACjB;;CAGH,MAAM,UAA0B,EAAE;CAClC,MAAM,cAAwB,EAAE;AAChC,MAAK,MAAM,QAAQ,QAAQ,MAAM,KAAK,EAAE;EACtC,MAAM,SAAS,wBAAwB,KAAK;AAC5C,MAAI,CAAC,OACH;AAEF,UAAQ,KAAK,OAAO,OAAO;AAC3B,cAAY,KAAK,OAAO,WAAW;;AAGrC,QAAO;EACL,QAAQ,YAAY,SAAS,YAAY,KAAK,KAAK,GAAG;EACtD,mBAAmB;EACpB;;AAGH,eAAe,cACb,KACA,SACsB;CACtB,MAAM,aAAa,IAAI,iBAAiB;CACxC,MAAM,UAAU,iBACR,WAAW,OAAO,EACxB,QAAQ,aAAa,mBACtB;CAED,MAAM,EAAE,QAAQ,kBAAkB,iBAAiB,QAAQ,YAAY;CACvE,MAAM,UAAU,IAAI,QAAQ,EAC1B,cAAc,QAAQ,aAAa,oBACpC,CAAC;AACF,KAAI,cACF,SAAQ,IAAI,UAAU,cAAc;AAGtC,KAAI;EACF,MAAM,WAAW,MAAM,MAAM,KAAK;GAChC;GACA,QAAQ;GACR,UAAU;GACV,QAAQ,WAAW;GACpB,CAAC;AACF,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MACR,8BAA8B,SAAS,OAAO,iCAC/C;EAGH,MAAM,WAAW,SAAS;EAC1B,MAAM,SAAS,MAAM,SAAS,aAAa;AAG3C,SAAO;GAAE;GAAU,WAAW;GAAO,MAFrB,IAAI,YAAY,QAAQ,SAAS,CAC5B,OAAO,OAAO;GACQ;UACpC,OAAO;EACd,MAAM,SACJ,iBAAiB,SAAS,MAAM,SAAS,eACrC,sBACA;AACN,QAAM,IAAI,MAAM,GAAG,OAAO,IAAI,OAAO,MAAM,IAAI,EAAE,OAAO,OAAO,CAAC;WACxD;AACR,eAAa,QAAQ;;;AAIzB,eAAe,iBACb,KACA,SACsB;CACtB,IAAI,aAAiD;AACrD,KAAI;AACF,eAAa,MAAM,OAAO;UACnB,OAAO;AACd,QAAM,IAAI,MACR,6EAA6E,OAC3E,MACD,CAAC,IACF,EAAE,OAAO,OAAO,CACjB;;CAGH,MAAM,EAAE,sBAAsB,iBAAiB,QAAQ,YAAY;CACnE,MAAM,UAAU,MAAM,WAAW,SAAS,OAAO,EAAE,UAAU,MAAM,CAAC;CACpE,MAAM,UAAU,MAAM,QAAQ,WAAW,EACvC,WAAW,QAAQ,aAAa,oBACjC,CAAC;AAEF,KAAI,kBAAkB,OACpB,OAAM,QAAQ,WACZ,kBAAkB,KAAK,YAAY;EACjC,GAAG;EACH,UAAU;EACV,UAAU;EACX,EAAE,CACJ;CAGH,MAAM,OAAO,MAAM,QAAQ,SAAS;AACpC,OAAM,KAAK,KAAK,KAAK;EACnB,SAAS,QAAQ,aAAa;EAC9B,WAAW;EACZ,CAAC;CAEF,MAAM,OAAO,MAAM,KAAK,SAAS;CACjC,MAAM,WAAW,KAAK,KAAK;AAE3B,OAAM,QAAQ,OAAO;AACrB,QAAO;EAAE;EAAU,WAAW;EAAO;EAAM;;AAG7C,eAAsB,UACpB,KACA,SACsB;CACtB,MAAM,eAAe,CAAC,QAAQ;AAC9B,KAAI,cAAc;E
AChB,MAAM,SAAS,MAAM,cAAc,KAAK;GACtC,SAAS;GACT,GAAG,QAAQ;GACZ,CAAC;AACF,MAAI,QAAQ;AACV,cAAW,aAAa,QAAQ,QAAQ;AACxC,UAAO;IAAE,UAAU;IAAK,WAAW;IAAM,MAAM,OAAO;IAAS;;;AAInE,YACE,YAAY,IAAI,GAAG,QAAQ,QAAQ,uBAAuB,YAC1D,QAAQ,QACT;CACD,MAAM,SAAS,QAAQ,QACnB,MAAM,iBAAiB,KAAK,QAAQ,GACpC,MAAM,cAAc,KAAK,QAAQ;AAErC,KAAI,aACF,OAAM,aAAa,KAAK,OAAO,MAAM;EACnC,SAAS;EACT,GAAG,QAAQ;EACZ,CAAC;AAGJ,QAAO;;;;;ACtOT,SAAgB,eAAe,MAAc,SAAyB;CACpE,MAAM,IAAI,KAAK,KAAK;AAEpB,MAAK,MAAM,OAAO,EAAE,MAAM,CAAC,SAAS,EAAE;EACpC,MAAM,OAAO,EAAE,IAAI;EAEnB,MAAM,cAAc,cADR,KAAK,KAAK,MAAM,EACW,QAAQ;AAC/C,MAAI,YACF,MAAK,KAAK,OAAO,YAAY;EAI/B,MAAM,UADS,KAAK,QAAQ,SAAS,CAE5B,KAAK,aAAa,CAAC,MAAM,CAAC,MAAM,IACvC,KAAK,KAAK,QAAQ,EAAE,MAAM,IAC1B;AACF,MAAI,QACF,MAAK,KAAK,wBAAwB,QAAQ;;AAI9C,QAAO,YAAY,EAAE;;;;;ACjBvB,SAAgB,iBAAiB,MAAgC;CAC/D,MAAM,QAAQ,CAAC,MAAM;AACrB,KAAI,KAAK,MACP,OAAM,KAAK,WAAW,kBAAkB,KAAK,MAAM,CAAC,GAAG;AAEzD,KAAI,KAAK,YACP,OAAM,KAAK,iBAAiB,kBAAkB,KAAK,YAAY,CAAC,GAAG;AAErE,KAAI,KAAK,OACP,OAAM,KAAK,YAAY,kBAAkB,KAAK,OAAO,CAAC,GAAG;AAE3D,KAAI,KAAK,KACP,OAAM,KAAK,UAAU,kBAAkB,KAAK,KAAK,CAAC,GAAG;AAEvD,OAAM,KAAK,YAAY,kBAAkB,KAAK,OAAO,CAAC,GAAG;AACzD,OAAM,KAAK,MAAM;AACjB,QAAO,MAAM,KAAK,KAAK;;AAGzB,SAAS,kBAAkB,OAAuB;AAChD,QAAO,MAAM,WAAW,MAAK,OAAO,GAAG,KAAK;;;;;ACjB9C,SAAS,eAAe,QAA0B,GAAyB;CACzE,MAAM,kBAAkB,OAAO,KAAK,WAAW;AAC/C,KAAI,gBAAgB,OAClB,QAAO,gBACJ,SAAS,CACT,KAAK,OAAO,EAAE,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAChC,OAAO,QAAQ;CAGpB,MAAM,kBAAkB,OAAO,KAAK,KAAK,CAAC,OAAO,CAAC,KAAK,SAAS;AAChE,KAAI,gBAAgB,OAClB,QAAO,gBACJ,SAAS,CACT,KAAK,MAAM,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC,MAAM,IAAI,UAAU,QAAQ,IAAI;AAGzE,QAAO,EAAE;;AAGX,SAAS,YACP,QACA,SACA,GAC0B;CAC1B,MAAM,OAAiC,EAAE;CACzC,MAAM,WACJ,OAAO,KAAK,WAAW,CAAC,SAAS,IAC7B,OAAO,KAAK,WAAW,GACvB,OAAO,KAAK,KAAK,CAAC,MAAM,EAAE;AAEhC,MAAK,MAAM,OAAO,SAAS,SAAS,EAAE;EACpC,MAAM,QAAQ,EAAE,IAAI,CAAC,KAAK,SAAS;AACnC,MAAI,CAAC,MAAM,OACT;EAEF,MAAM,SAAiC,EAAE;AACzC,OAAK,MAAM,CAAC,WAAW,SAAS,MAAM,SAAS,CAAC,SAAS,EAAE;GACzD,MAAM,MAAM,QAAQ,cAAc,UAAU,YAAY;AACxD,UAAO,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,MAAM;;AAEr
C,OAAK,KAAK,OAAO;;AAGnB,QAAO;;AAGT,SAAgB,oBAAoB,MAAsB;CACxD,MAAM,IAAI,KAAK,KAAK;AAEpB,MAAK,MAAM,SAAS,EAAE,QAAQ,CAAC,SAAS,EAAE;EACxC,MAAM,SAAS,EAAE,MAAM;EACvB,MAAM,UAAU,OAAO,KAAK,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,IAAI;EAChE,MAAM,UAAU,eAAe,QAAQ,EAAE;EAGzC,MAAM,OAAkB;GACtB;GACA;GACA,MALW,YAAY,QAAQ,SAAS,EAAE;GAM3C;EAED,MAAM,MAAM,EAAE,QAAQ,CACnB,KAAK,sBAAsB,OAAO,CAClC,KAAK,KAAK,UAAU,MAAM,MAAM,EAAE,CAAC;AACtC,SAAO,YAAY,IAAI;;AAGzB,QAAO,YAAY,EAAE;;;;;ACpEvB,MAAM,kBAAkB;AAgBxB,eAAe,IAAI,KAAa,SAAqB;CACnD,MAAM,YACJ,QAAQ,SACJ,MAAM,IAAI,CACX,KAAK,aAAa,SAAS,MAAM,CAAC,CAClC,OAAO,QAAQ,IAAI,EAAE;AAE1B,KAAI,QAAQ,QACV,SAAQ,MAAM,oBAAoB;CAGpC,MAAM,cAAc,MAAM,UAAU,KAAK;EACvC,aAAa,QAAQ;EACrB,UAAU,QAAQ;EAClB,SAAS,QAAQ;EACjB,WAAW,QAAQ,WAAW;EAC9B,OAAO,QAAQ;EACf,WAAW,QAAQ;EACnB,SAAS,QAAQ;EAClB,CAAC;CAEF,MAAM,YAAY,eAAe,YAAY,MAAM;EACjD,SAAS,YAAY;EACrB,kBAAkB;EAClB,KAAK,QAAQ;EACd,CAAC;CAEF,IAAI,cAAc,UAAU;AAC5B,eAAc,oBAAoB,YAAY;AAC9C,eAAc,eAAe,aAAa,YAAY,SAAS;CAE/D,MAAM,WAAW,sBAAsB,aAAa;EAClD,SAAS,YAAY;EACrB,YAAY,QAAQ;EACrB,CAAC;CAOF,MAAM,SAAS,GALK,iBAAiB;EACnC,GAAG,UAAU;EACb,QAAQ,YAAY;EACrB,CAAC,CAE4B,MAAM,WAAW,MAAM;AAErD,KAAI,QAAQ,QAAQ;AAClB,QAAM,UAAU,QAAQ,QAAQ,QAAQ,OAAO;AAC/C,MAAI,QAAQ,QACV,SAAQ,MAAM,YAAY,QAAQ,SAAS;OAG7C,SAAQ,IAAI,OAAO;CAGrB,MAAM,OAAO,OAAO,WAAW,QAAQ,OAAO;AAC9C,KAAI,OAAO,IACT,SAAQ,MACN,sBAAsB,KAAK,MAAM,OAAO,KAAK,CAAC,oDAC/C;;AAIL,SAAS,eAAe;CACtB,MAAM,UAAU,IAAI,SAAS,CAC1B,KAAK,UAAU,CACf,YAAY,wDAAwD,CACpE,SAAS,SAAS,eAAe,CACjC,OAAO,uBAAuB,yCAAyC,CACvE,OAAO,QAAQ,4DAA4D,CAC3E,OAAO,SAAS,+CAA+C,CAC/D,OACC,oBACA,kDACD,CACA,OAAO,yBAAyB,2BAA2B,CAC3D,OACC,yBACA,sDACD,CACA,OAAO,iBAAiB,2CAA2C,CACnE,OACC,yBACA,6CACD,CACA,OACC,kBACA,mCACA,GAAG,kBACJ,CACA,OAAO,cAAc,wBAAwB,CAC7C,OAAO,iBAAiB,qCAAqC;AAEhE,SAAQ,QAAQ,QAAQ;AACxB,QAAO;;AAGT,eAAe,OAAO;CACpB,MAAM,UAAU,cAAc;AAC9B,SAAQ,MAAM,QAAQ,KAAK;CAC3B,MAAM,CAAC,OAAO,QAAQ;AACtB,KAAI,CAAC,KAAK;AACR,UAAQ,MAAM;AACd;;CAGF,MAAM,OAAO,QAAQ,MAAkB;AACvC,KAAI;AACF,QAAM,IAAI,KAAK;GACb,GAAG;GACH,SAAS,KAAK,UAAU,OAAO,KAAK,QAAQ,GAAG;GAChD,CAAC;UACK,OAAO;EACd,MAAM,U
AAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,MAAM;AACtE,UAAQ,MAAM,QAAQ;AACtB,UAAQ,WAAW;;;AAIvB,MAAM"}
|