into-md 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/fetcher.ts ADDED
@@ -0,0 +1,236 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { basename } from "node:path";
3
+ import { type CacheOptions, readFromCache, writeToCache } from "./cache";
4
+
5
/**
 * Options accepted by {@link fetchPage} and forwarded to the HTTP and
 * headless-browser fetchers.
 */
export interface FetchOptions {
  /** When truthy, `fetchPage` uses the headless-browser fetcher instead of plain HTTP. */
  useJs?: boolean;
  /** Path of a Netscape-format cookies file; read synchronously when set. */
  cookiesPath?: string;
  /** Overrides the built-in default User-Agent string. */
  userAgent?: string;
  /** Label handed to `TextDecoder` when decoding an HTTP response body. */
  encoding?: string;
  /** Timeout in milliseconds; the module default is used when omitted. */
  timeoutMs?: number;
  /** Extra cache options spread into the `readFromCache` / `writeToCache` calls. */
  cache?: Partial<CacheOptions>;
  /** When truthy, the cache is neither read nor written. */
  noCache?: boolean;
  /** When truthy, progress messages are written to stderr. */
  verbose?: boolean;
}
15
+
16
/** Outcome of a page fetch, whether served from the cache or the network. */
export interface FetchResult {
  /** The page markup as a string. */
  html: string;
  /** URL reported after the fetch; on a cache hit this is the requested URL. */
  finalUrl: string;
  /** `true` when the content came from the cache rather than a fresh request. */
  fromCache: boolean;
}
21
+
22
/** One cookie parsed from a tab-separated Netscape cookies file line. */
interface CookieRecord {
  /** Cookie name (sixth column). */
  name: string;
  /** Cookie value (seventh column). */
  value: string;
  /** Domain column, kept exactly as written in the file. */
  domain: string;
  /** Path column. */
  path: string;
  /** `true` when the secure column reads "true" (case-insensitive). */
  secure: boolean;
  /** Expiry column converted with `Number`. */
  expires: number;
}
30
+
31
/** User-Agent sent when the caller does not supply one. */
const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0 Safari/537.36";

/** Fallback request timeout, in milliseconds (30 seconds). */
const DEFAULT_TIMEOUT_MS = 30 * 1000;
35
+
36
/**
 * Writes `message` to stderr, but only when `verbose` is truthy.
 *
 * @param message - Text to emit.
 * @param verbose - Verbosity flag; omitted or falsy silences the message.
 */
const logVerbose = (message: string, verbose?: boolean): void => {
  if (!verbose) {
    return;
  }
  console.error(message);
};
41
+
42
/**
 * Parses one line of a Netscape-format (tab-separated) cookies file.
 *
 * Columns: domain, include-subdomains flag, path, secure flag, expiry, name,
 * value. The include-subdomains column is ignored.
 *
 * @param line - A single line of the cookies file, with or without its line
 *   terminator.
 * @returns The parsed cookie plus its `name=value` header pair, or `null` for
 *   blank lines, comments, and lines with fewer than seven columns or an empty
 *   mandatory column.
 */
function parseNetscapeCookieLine(
  line: string
): { record: CookieRecord; headerPair: string } | null {
  // curl-style cookie jars mark HttpOnly cookies by prefixing the domain with
  // this token; such lines are real cookies, not comments.
  const httpOnlyPrefix = "#HttpOnly_";

  // Strip only the line terminator and leading blanks. A trailing tab is
  // significant: it delimits an empty cookie value.
  let text = line.replace(/[\r\n]+$/, "").trimStart();
  if (text.startsWith(httpOnlyPrefix)) {
    text = text.slice(httpOnlyPrefix.length);
  } else if (!text || text.startsWith("#")) {
    return null;
  }

  const parts = text.split("\t");
  if (parts.length < 7) {
    return null;
  }

  const [domain, , path, secureFlag, expires, name, rawValue] = parts;
  // The value may legitimately be empty; every other column is mandatory.
  if (!(domain && path && secureFlag && expires && name) || rawValue === undefined) {
    return null;
  }
  const value = rawValue.trimEnd();

  return {
    headerPair: `${name}=${value}`,
    record: {
      domain,
      expires: Number(expires),
      name,
      path,
      secure: secureFlag.toLowerCase() === "true",
      value,
    },
  };
}
77
+
78
/**
 * Loads a Netscape-format cookies file and exposes its cookies in two shapes:
 * a ready-made `Cookie` header value and a list of structured records.
 *
 * @param cookiesPath - Path of the cookies file; when omitted or empty no file
 *   is read.
 * @returns `header` is the `"; "`-joined `name=value` pairs, or `undefined`
 *   when no cookie was parsed; `playwrightCookies` holds the parsed records.
 * @throws Error when the file cannot be read. The message contains only the
 *   file's base name, and the underlying error is attached as `cause`.
 */
function parseCookiesFile(cookiesPath?: string): {
  header: string | undefined;
  playwrightCookies: CookieRecord[];
} {
  if (!cookiesPath) {
    return { header: undefined, playwrightCookies: [] };
  }

  const raw = ((): string => {
    try {
      return readFileSync(cookiesPath, "utf8");
    } catch (error) {
      throw new Error(
        `Unable to read cookies file "${basename(cookiesPath)}": ${String(error)}`,
        { cause: error }
      );
    }
  })();

  // Parse every line, then discard the ones the line parser rejected.
  const parsedLines = raw
    .split("\n")
    .map((line) => parseNetscapeCookieLine(line))
    .filter((entry): entry is NonNullable<typeof entry> => entry !== null);

  const pairs = parsedLines.map((entry) => entry.headerPair);
  return {
    header: pairs.length > 0 ? pairs.join("; ") : undefined,
    playwrightCookies: parsedLines.map((entry) => entry.record),
  };
}
111
+
112
/**
 * Fetches `url` with the global `fetch`, following redirects, and decodes the
 * body with `TextDecoder`.
 *
 * @param url - Address to request.
 * @param options - Supplies the cookies file, User-Agent, encoding label and
 *   timeout; defaults are used for an omitted User-Agent or timeout.
 * @returns The decoded markup and the response's final URL.
 * @throws Error when the cookies file cannot be read, when the response
 *   status is not OK ("Request failed with status N…"), when the timeout
 *   elapses ("Request timed out: …"), or for any other failure
 *   ("Request failed: …", with the original error attached as `cause`).
 */
async function fetchWithHttp(
  url: string,
  options: FetchOptions
): Promise<FetchResult> {
  // Read cookies BEFORE arming the timer: if the file is unreadable this
  // throws, and a timer armed earlier would never be cleared, keeping the
  // process alive until it fires.
  const { header: cookiesHeader } = parseCookiesFile(options.cookiesPath);
  const headers = new Headers({
    "User-Agent": options.userAgent ?? DEFAULT_USER_AGENT,
  });
  if (cookiesHeader) {
    headers.set("Cookie", cookiesHeader);
  }

  const controller = new AbortController();
  const timeout = setTimeout(
    () => controller.abort(),
    options.timeoutMs ?? DEFAULT_TIMEOUT_MS
  );

  // Remembers the HTTP-status error so the catch block can pass it through
  // untouched instead of wrapping it in a second "Request failed:" prefix.
  let statusError: Error | undefined;

  try {
    const response = await fetch(url, {
      headers,
      method: "GET",
      redirect: "follow",
      signal: controller.signal,
    });
    if (!response.ok) {
      statusError = new Error(
        `Request failed with status ${response.status}. If blocked, try --user-agent.`
      );
      throw statusError;
    }

    const finalUrl = response.url;
    const buffer = await response.arrayBuffer();
    const decoder = new TextDecoder(options.encoding);
    const html = decoder.decode(buffer);
    return { finalUrl, fromCache: false, html };
  } catch (error) {
    if (statusError !== undefined && error === statusError) {
      throw statusError;
    }
    // Only the timer above ever aborts the controller, so an aborted signal
    // means the timeout elapsed.
    const timedOut =
      controller.signal.aborted ||
      (error instanceof Error && error.name === "AbortError");
    const prefix = timedOut ? "Request timed out" : "Request failed";
    throw new Error(`${prefix}: ${String(error)}`, { cause: error });
  } finally {
    clearTimeout(timeout);
  }
}
158
+
159
/**
 * Renders `url` in headless Chromium via Playwright and returns the resulting
 * markup.
 *
 * Playwright is imported lazily, so it only needs to be installed when JS
 * mode is actually requested.
 *
 * @param url - Address to open.
 * @param options - Supplies the cookies file, User-Agent and navigation
 *   timeout; defaults are used for an omitted User-Agent or timeout.
 * @returns The page's serialized content and the URL after navigation.
 * @throws Error when Playwright cannot be imported, when the cookies file
 *   cannot be read, or when launching, navigating or reading the page fails.
 */
async function fetchWithBrowser(
  url: string,
  options: FetchOptions
): Promise<FetchResult> {
  let playwright: typeof import("playwright");
  try {
    playwright = await import("playwright");
  } catch (error) {
    throw new Error(
      `JS mode requested but playwright is not installed. Install it and retry. (${String(
        error
      )})`,
      { cause: error }
    );
  }

  const { playwrightCookies } = parseCookiesFile(options.cookiesPath);
  const browser = await playwright.chromium.launch({ headless: true });

  // Everything after launch runs under try/finally: a navigation timeout or
  // any other failure must still shut the browser down, otherwise the child
  // process keeps the CLI from exiting.
  try {
    const context = await browser.newContext({
      userAgent: options.userAgent ?? DEFAULT_USER_AGENT,
    });

    if (playwrightCookies.length) {
      await context.addCookies(
        playwrightCookies.map((cookie) => ({
          ...cookie,
          // Netscape files use 0 for session cookies, whereas Playwright
          // expects -1; a literal 0 would mean "expired at the Unix epoch".
          // Unparseable expiries are treated as session cookies too.
          expires:
            Number.isFinite(cookie.expires) && cookie.expires > 0
              ? cookie.expires
              : -1,
          httpOnly: false,
          sameSite: "Lax" as const,
        }))
      );
    }

    const page = await context.newPage();
    await page.goto(url, {
      timeout: options.timeoutMs ?? DEFAULT_TIMEOUT_MS,
      waitUntil: "networkidle",
    });

    const html = await page.content();
    const finalUrl = page.url();
    return { finalUrl, fromCache: false, html };
  } finally {
    await browser.close();
  }
}
203
+
204
/**
 * Returns the markup for `url`, preferring a cached copy when caching is on.
 *
 * On a cache miss the page is fetched with the headless browser
 * (`options.useJs`) or plain HTTP, and the fresh markup is written back to
 * the cache unless `options.noCache` is set.
 *
 * @param url - Address to fetch; also used as the cache key argument.
 * @param options - Fetch and cache settings.
 * @returns The markup, the final URL (the requested URL on a cache hit) and
 *   whether the cache served it.
 */
export async function fetchPage(
  url: string,
  options: FetchOptions
): Promise<FetchResult> {
  const { noCache, useJs, verbose } = options;
  const cacheEnabled = !noCache;
  // Built on demand so the read and the write each get their own object.
  const cacheOptions = () => ({ enabled: cacheEnabled, ...options.cache });

  const cached = cacheEnabled
    ? await readFromCache(url, cacheOptions())
    : undefined;
  if (cached) {
    logVerbose("Cache hit", verbose);
    return { finalUrl: url, fromCache: true, html: cached.content };
  }

  const mode = useJs ? "(headless browser)" : "(http)";
  logVerbose(`Fetching ${url} ${mode}`, verbose);

  const fetcher = useJs ? fetchWithBrowser : fetchWithHttp;
  const result = await fetcher(url, options);

  if (cacheEnabled) {
    await writeToCache(url, result.html, cacheOptions());
  }
  return result;
}
package/src/images.ts ADDED
@@ -0,0 +1,27 @@
1
+ import { load } from "cheerio";
2
+
3
+ import { getBodyHtml, toAbsoluteUrl } from "./utils";
4
+
5
/**
 * Rewrites every `<img>` in `html` so its `src` is absolute and, when a
 * caption is available, records it in a `data-into-md-caption` attribute.
 *
 * The caption is the trimmed text of a `<figcaption>` inside the closest
 * enclosing `<figure>`, falling back to the image's trimmed `title`.
 *
 * @param html - Markup to process.
 * @param baseUrl - Base used to resolve relative image sources.
 * @returns The processed markup as produced by `getBodyHtml`.
 */
export function annotateImages(html: string, baseUrl: string): string {
  const $ = load(html);

  $("img").each((_index, element) => {
    const $img = $(element);

    const resolvedSrc = toAbsoluteUrl($img.attr("src"), baseUrl);
    if (resolvedSrc) {
      $img.attr("src", resolvedSrc);
    }

    const figcaptionText = $img
      .closest("figure")
      .find("figcaption")
      .text()
      .trim();
    const caption = figcaptionText || $img.attr("title")?.trim() || undefined;
    if (caption) {
      $img.attr("data-into-md-caption", caption);
    }
  });

  return getBodyHtml($);
}
package/src/index.ts ADDED
@@ -0,0 +1,143 @@
1
+ import { writeFile } from "node:fs/promises";
2
+ import { Command } from "commander";
3
+
4
+ import { convertHtmlToMarkdown } from "./converter";
5
+ import { extractContent } from "./extractor";
6
+ import { fetchPage } from "./fetcher";
7
+ import { annotateImages } from "./images";
8
+ import { buildFrontmatter } from "./metadata";
9
+ import { convertTablesToJson } from "./tables";
10
+
11
/** Request timeout used when `--timeout` is not given, in milliseconds (30 s). */
const DEFAULT_TIMEOUT = 30 * 1000;
12
+
13
/** Command-line options as consumed by `run`. */
interface CliOptions {
  /** File to write the result to; stdout is used when absent. */
  output?: string;
  /** Forwarded to `fetchPage` as `useJs`. */
  js?: boolean;
  /** Forwarded to `extractContent` as `raw`. */
  raw?: boolean;
  /** Cookies file path, forwarded to `fetchPage` as `cookiesPath`. */
  cookies?: string;
  /** Custom User-Agent, forwarded to `fetchPage`. */
  userAgent?: string;
  /** Encoding label, forwarded to `fetchPage`. */
  encoding?: string;
  /** Forwarded to `convertHtmlToMarkdown` as `stripLinks`. */
  stripLinks?: boolean;
  /** Comma-separated CSS selectors; split and trimmed before extraction. */
  exclude?: string;
  /** Timeout in milliseconds; `main` converts the raw CLI value with `Number`. */
  timeout?: number;
  /** When truthy, `run` asks `fetchPage` to bypass the cache. */
  noCache?: boolean;
  /** When truthy, progress messages are written to stderr. */
  verbose?: boolean;
}
26
+
27
/**
 * Runs the whole pipeline for one URL: fetch, extract the content, convert
 * tables and annotate images, convert to markdown, prepend the frontmatter,
 * then write the result to `options.output` or print it to stdout.
 *
 * A warning is written to stderr when the output exceeds 100,000 bytes.
 *
 * @param url - Page to convert.
 * @param options - Parsed command-line options.
 */
async function run(url: string, options: CliOptions) {
  const { verbose } = options;

  // "a, b ,," -> ["a", "b"]; a missing option yields no selectors.
  const selectors = (options.exclude ?? "")
    .split(",")
    .map((selector) => selector.trim())
    .filter(Boolean);

  if (verbose) {
    console.error("Starting into-md…");
  }

  const { html: fetchedHtml, finalUrl } = await fetchPage(url, {
    cookiesPath: options.cookies,
    encoding: options.encoding,
    noCache: options.noCache,
    timeoutMs: options.timeout ?? DEFAULT_TIMEOUT,
    useJs: options.js,
    userAgent: options.userAgent,
    verbose,
  });

  const extracted = extractContent(fetchedHtml, {
    baseUrl: finalUrl,
    excludeSelectors: selectors,
    raw: options.raw,
  });

  // Tables are converted first, then images are annotated on that result.
  const workingHtml = annotateImages(
    convertTablesToJson(extracted.html),
    finalUrl
  );

  const markdown = convertHtmlToMarkdown(workingHtml, {
    baseUrl: finalUrl,
    stripLinks: options.stripLinks,
  });

  const frontmatter = buildFrontmatter({
    ...extracted.metadata,
    source: finalUrl,
  });

  const output = `${frontmatter}\n\n${markdown}`.trim();

  if (!options.output) {
    console.log(output);
  } else {
    await writeFile(options.output, output, "utf8");
    if (verbose) {
      console.error(`Saved to ${options.output}`);
    }
  }

  const size = Buffer.byteLength(output, "utf8");
  if (size > 100_000) {
    console.error(
      `Warning: Output is ${Math.round(size / 1024)}KB. Large documents may exceed LLM context limits.`
    );
  }
}
86
+
87
/**
 * Builds the commander program for the `into-md` CLI: one required `<url>`
 * argument, the options listed below, and version "0.1.0".
 *
 * @returns The configured, not yet parsed, program.
 */
function buildProgram() {
  const program = new Command();

  program.name("into-md");
  program.description("Fetch a web page and convert its content to markdown.");
  program.argument("<url>", "URL to fetch");

  // Options are registered in the order they should appear in the help text.
  program.option("-o, --output <file>", "Write output to file instead of stdout");
  program.option("--js", "Use headless browser (Playwright) for JS-rendered content");
  program.option("--raw", "Skip content extraction, convert entire HTML");
  program.option(
    "--cookies <file>",
    "Path to cookies file for authenticated requests"
  );
  program.option("--user-agent <string>", "Custom User-Agent header");
  program.option(
    "--encoding <encoding>",
    "Force character encoding (auto-detected by default)"
  );
  program.option("--strip-links", "Remove hyperlinks, keep only anchor text");
  program.option(
    "--exclude <selectors>",
    "CSS selectors to exclude (comma-separated)"
  );
  program.option(
    "--timeout <ms>",
    "Request timeout in milliseconds",
    `${DEFAULT_TIMEOUT}`
  );
  program.option("--no-cache", "Bypass response cache");
  program.option("-v, --verbose", "Show detailed progress information");

  program.version("0.1.0");
  return program;
}
120
+
121
/**
 * CLI entry point: parses `process.argv`, shows the help when no URL was
 * given, and otherwise runs the conversion.
 *
 * Any failure is reported as a single message on stderr and turned into exit
 * code 1 instead of an unhandled rejection.
 */
async function main() {
  const program = buildProgram();
  program.parse(process.argv);
  const [url] = program.args;
  if (!url) {
    program.help();
    return;
  }

  // commander stores a negatable flag such as `--no-cache` under the positive
  // name (`cache: false`); it never sets `noCache`. Read `cache` as well so
  // the flag actually reaches `run`.
  const opts = program.opts<CliOptions & { cache?: boolean }>();
  try {
    // The option value arrives as a string (including the declared default).
    const rawTimeout = String(opts.timeout ?? "").trim();
    const timeout = rawTimeout === "" ? DEFAULT_TIMEOUT : Number(rawTimeout);
    // NaN or a non-positive delay would make the abort timer fire at once
    // and surface as a confusing "Request timed out".
    if (!Number.isFinite(timeout) || timeout <= 0) {
      throw new Error(
        `Invalid --timeout value "${rawTimeout}": expected a positive number of milliseconds.`
      );
    }

    await run(url, {
      ...opts,
      noCache: opts.noCache ?? (opts.cache === false),
      timeout,
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exitCode = 1;
  }
}
142
+
143
+ main();
@@ -0,0 +1,30 @@
1
/** Metadata rendered into the frontmatter block by `buildFrontmatter`. */
export interface FrontmatterInput {
  /** Emitted as `title` when non-empty. */
  title?: string;
  /** Emitted as `description` when non-empty. */
  description?: string;
  /** Emitted as `author` when non-empty. */
  author?: string;
  /** Emitted as `date` when non-empty. */
  date?: string;
  /** Always emitted as `source`. */
  source: string;
}
8
+
9
/**
 * Renders `meta` as a frontmatter block delimited by `---` lines.
 *
 * Optional fields appear in the order title, description, author, date, and
 * only when non-empty; `source` is always last. Every value is double-quoted
 * after passing through `escapeFrontmatter`.
 *
 * @param meta - Metadata to render.
 * @returns The block, lines joined with `\n` and no trailing newline.
 */
export function buildFrontmatter(meta: FrontmatterInput): string {
  const optionalKeys = ["title", "description", "author", "date"] as const;

  const lines = ["---"];
  for (const key of optionalKeys) {
    const fieldValue = meta[key];
    if (fieldValue) {
      lines.push(`${key}: "${escapeFrontmatter(fieldValue)}"`);
    }
  }
  lines.push(`source: "${escapeFrontmatter(meta.source)}"`, "---");

  return lines.join("\n");
}
27
+
28
/**
 * Escapes `value` for use inside a double-quoted YAML scalar.
 *
 * @param value - Raw text, e.g. a page title.
 * @returns The text with backslashes, double quotes and line breaks escaped.
 */
function escapeFrontmatter(value: string): string {
  return (
    value
      // Backslashes first: inside double quotes `\` starts an escape sequence,
      // so a literal one must be doubled before any new ones are introduced.
      .replace(/\\/g, "\\\\")
      .replace(/"/g, '\\"')
      // A raw line break would split the `key: "value"` entry across lines.
      .replace(/\r\n|\r|\n/g, "\\n")
  );
}
package/src/tables.ts ADDED
@@ -0,0 +1,80 @@
1
+ import { type Cheerio, type CheerioAPI, load } from "cheerio";
2
+ import type { AnyNode } from "domhandler";
3
+
4
+ import { getBodyHtml } from "./utils";
5
+
6
/** JSON shape that replaces each `<table>` in `convertTablesToJson`. */
interface TableJson {
  /** Trimmed text of the table's first `<caption>`, when non-empty. */
  caption?: string;
  /** Column labels, in column order. */
  headers: string[];
  /** One record per data row, mapping column label to trimmed cell text. */
  rows: Record<string, string>[];
}
11
+
12
/**
 * Determines the column labels of a table.
 *
 * `<th>` cells inside `<thead>` are preferred; otherwise the cells of the
 * table's first row are used. An empty cell is labelled `Column N` (1-based)
 * so that every label keeps the index of its column.
 *
 * @param $table - The table to inspect.
 * @param $ - The cheerio instance that owns the table.
 * @returns One label per header cell, or an empty array when the table has no
 *   rows.
 */
function extractHeaders($table: Cheerio<AnyNode>, $: CheerioAPI): string[] {
  // Placeholders instead of filtering: dropping an empty header would shift
  // every later label one column to the left of its cells.
  const labelCells = (cells: { toArray(): AnyNode[] }): string[] =>
    cells
      .toArray()
      .map((cell, index) => $(cell).text().trim() || `Column ${index + 1}`);

  const explicitHeaders = $table.find("thead th");
  if (explicitHeaders.length) {
    return labelCells(explicitHeaders);
  }

  const firstRowCells = $table.find("tr").first().find("th, td");
  return firstRowCells.length ? labelCells(firstRowCells) : [];
}
30
+
31
/**
 * Converts the data rows of a table into records keyed by column label.
 *
 * Rows inside `<tbody>` are used when there are any; otherwise every row
 * except the first. A row without cells is skipped. A cell beyond the known
 * headers is keyed `Column N` (1-based).
 *
 * @param $table - The table to read.
 * @param headers - Column labels, as returned by `extractHeaders`.
 * @param $ - The cheerio instance that owns the table.
 * @returns One record per data row, in document order.
 */
function extractRows(
  $table: Cheerio<AnyNode>,
  headers: string[],
  $: CheerioAPI
): Record<string, string>[] {
  const bodyRows = $table.find("tbody tr");

  // HTML parsers wrap bare <tr> elements in an implicit <tbody>, so a table
  // without <thead> usually has its header row INSIDE <tbody>. When the labels
  // were taken from that first row (no `thead th`), it must not be repeated
  // as a data row.
  const implicitHeaderRow =
    $table.find("thead th").length > 0 ? undefined : $table.find("tr").get(0);

  const dataRows =
    bodyRows.length > 0
      ? bodyRows.toArray().filter((row) => row !== implicitHeaderRow)
      : $table.find("tr").slice(1).toArray();

  const rows: Record<string, string>[] = [];
  for (const row of dataRows) {
    const cells = $(row).find("td, th");
    if (!cells.length) {
      continue;
    }
    const record: Record<string, string> = {};
    for (const [cellIndex, cell] of cells.toArray().entries()) {
      const key = headers[cellIndex] ?? `Column ${cellIndex + 1}`;
      record[key] = $(cell).text().trim();
    }
    rows.push(record);
  }

  return rows;
}
57
+
58
/**
 * Replaces every `<table>` in `html` with a `<pre data-into-md-table="true">`
 * element whose text is the table serialized as pretty-printed JSON
 * (caption, headers, rows).
 *
 * @param html - Markup to process.
 * @returns The processed markup as produced by `getBodyHtml`.
 */
export function convertTablesToJson(html: string): string {
  const $ = load(html);

  $("table").each((_index, element) => {
    const $table = $(element);

    const captionText = $table.find("caption").first().text().trim();
    const headers = extractHeaders($table, $);
    const json: TableJson = {
      caption: captionText || undefined,
      headers,
      rows: extractRows($table, headers, $),
    };

    const replacement = $("<pre>");
    replacement.attr("data-into-md-table", "true");
    replacement.text(JSON.stringify(json, null, 2));
    $table.replaceWith(replacement);
  });

  return getBodyHtml($);
}
@@ -0,0 +1,10 @@
1
/**
 * Minimal ambient typing for the parts of "jsdom" declared here: the `JSDOM`
 * constructor and the `window` of the resulting document.
 */
declare module "jsdom" {
  /** Options accepted by the `JSDOM` constructor. */
  export interface JSDOMOptions {
    /** URL to associate with the parsed document. */
    url?: string;
  }

  /** A parsed document. */
  export class JSDOM {
    /**
     * @param html - Markup to parse.
     * @param options - Construction options.
     */
    constructor(html?: string, options?: JSDOMOptions);

    /** Window object of the parsed document. */
    window: Window;
  }
}
package/src/utils.ts ADDED
@@ -0,0 +1,28 @@
1
+ import type { CheerioAPI } from "cheerio";
2
+
3
/**
 * Resolves `url` against `baseUrl` and returns the absolute form.
 *
 * @param url - Possibly relative URL; `undefined` or an empty string yields
 *   `undefined`.
 * @param baseUrl - Base the URL is resolved against.
 * @returns The absolute URL, or `url` unchanged when it cannot be resolved
 *   (for example because `baseUrl` is not a valid URL).
 */
export const toAbsoluteUrl = (
  url: string | undefined,
  baseUrl: string
): string | undefined => {
  if (url === undefined || url === "") {
    return undefined;
  }

  let resolved: string;
  try {
    resolved = new URL(url, baseUrl).href;
  } catch {
    // Unresolvable input is handed back untouched.
    resolved = url;
  }
  return resolved;
};
20
+
21
/**
 * Serializes the contents of the document's `<body>`; when no `<body>`
 * element is found, the contents of the document root are used instead.
 *
 * @param $ - Loaded cheerio document.
 * @returns The inner HTML, or an empty string when none is available.
 */
export const getBodyHtml = ($: CheerioAPI): string => {
  const body = $("body");
  if (body.length) {
    return body.html() ?? "";
  }
  return $.root().html() ?? "";
};
package/tsconfig.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "compilerOptions": {
3
+ // Environment setup & latest features
4
+ "lib": ["ESNext", "DOM"],
5
+ "target": "ESNext",
6
+ "module": "Preserve",
7
+ "moduleDetection": "force",
8
+ "jsx": "react-jsx",
9
+ "allowJs": true,
10
+
11
+ // Bundler mode
12
+ "moduleResolution": "bundler",
13
+ "allowImportingTsExtensions": true,
14
+ "verbatimModuleSyntax": true,
15
+ "noEmit": true,
16
+
17
+ // Best practices
18
+ "strict": true,
19
+ "skipLibCheck": true,
20
+ "noFallthroughCasesInSwitch": true,
21
+ "noUncheckedIndexedAccess": true,
22
+ "noImplicitOverride": true,
23
+
24
+ // Some stricter flags (disabled by default)
25
+ "noUnusedLocals": false,
26
+ "noUnusedParameters": false,
27
+ "noPropertyAccessFromIndexSignature": false
28
+ }
29
+ }
@@ -0,0 +1,14 @@
1
+ import { defineConfig } from "tsdown";
2
+
3
// Build configuration: one ESM bundle of the CLI entry point.
export default defineConfig({
  // Input and output locations.
  entry: ["src/index.ts"],
  outDir: "dist",
  clean: true,
  hash: false,

  // Output flavour.
  format: ["esm"],
  platform: "node",
  target: "node18",

  // Extra artefacts.
  dts: true,
  sourcemap: true,

  // Shebang line placed at the top of the bundle.
  banner: "#!/usr/bin/env node\n",
});