into-md 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/SPEC.md ADDED
@@ -0,0 +1,201 @@
1
+ # into-md
2
+
3
+ A CLI tool that fetches web pages and converts them to clean markdown, optimized for providing context to LLMs.
4
+
5
+ ## Overview
6
+
7
+ `into-md` fetches a single URL, extracts the main content using readability heuristics, and outputs clean markdown suitable for LLM consumption. It preserves images with context, converts tables to structured JSON, and includes standard metadata.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ npm install -g into-md
13
+ # or
14
+ bunx into-md <url>
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ```bash
20
+ into-md <url> [options]
21
+ ```
22
+
23
+ ### Examples
24
+
25
+ ```bash
26
+ # Output to stdout
27
+ into-md https://example.com/article
28
+
29
+ # Save to file
30
+ into-md https://example.com/article -o article.md
31
+
32
+ # Use headless browser for JS-rendered content
33
+ into-md https://spa-site.com/page --js
34
+
35
+ # Skip content extraction, convert full page
36
+ into-md https://example.com --raw
37
+
38
+ # With authentication cookies
39
+ into-md https://private-site.com/page --cookies cookies.txt
40
+
41
+ # Verbose output
42
+ into-md https://example.com/article -v
43
+ ```
44
+
45
+ ## Options
46
+
47
+ | Flag | Description | Default |
48
+ | ----------------------- | --------------------------------------------------------- | --------------- |
49
+ | `-o, --output <file>` | Write output to file instead of stdout | stdout |
50
+ | `--js` | Use headless browser (Playwright) for JS-rendered content | disabled |
51
+ | `--raw` | Skip content extraction, convert entire HTML | disabled |
52
+ | `--cookies <file>` | Path to cookies file for authenticated requests | none |
53
+ | `--user-agent <string>` | Custom User-Agent header | browser-like UA |
54
+ | `--encoding <encoding>` | Force character encoding (auto-detected by default) | auto |
55
+ | `--strip-links` | Remove hyperlinks, keep only anchor text | disabled |
56
+ | `--exclude <selectors>` | CSS selectors to exclude (comma-separated) | none |
57
+ | `--timeout <ms>` | Request timeout in milliseconds | 30000 |
58
+ | `--no-cache` | Bypass response cache | cache enabled |
59
+ | `-v, --verbose` | Show detailed progress information | minimal |
60
+ | `-h, --help` | Show help | - |
61
+ | `--version` | Show version | - |
62
+
63
+ ## Output Format
64
+
65
+ ### Frontmatter
66
+
67
+ Standard metadata is included as YAML frontmatter when available:
68
+
69
+ ```yaml
70
+ ---
71
+ title: "Article Title"
72
+ description: "Meta description from the page"
73
+ author: "Author Name"
74
+ date: "2024-01-15"
75
+ source: "https://example.com/article"
76
+ ---
77
+ ```
78
+
79
+ ### Content Structure
80
+
81
+ - **Headings**: Preserved as-is from source (original hierarchy maintained)
82
+ - **Text formatting**: Semantic formatting preserved (bold, italic, strikethrough); decorative formatting (colors, underlines) stripped
83
+ - **Links**: Preserved as markdown links by default; all relative URLs converted to absolute
84
+ - **Code blocks**: Language auto-detected and tagged for syntax highlighting
85
+
86
+ ### Images
87
+
88
+ Images include alt text, URL, and surrounding context:
89
+
90
+ ```markdown
91
+ ![Diagram showing the system architecture](https://example.com/images/arch.png)
92
+ _Figure 1: The system uses a microservices architecture with three main components._
93
+ ```
94
+
95
+ ### Tables
96
+
97
+ Tables are converted to fenced JSON blocks for reliable LLM parsing:
98
+
99
+ ```json
100
+ {
101
+ "caption": "Quarterly Revenue",
102
+ "headers": ["Quarter", "Revenue", "Growth"],
103
+ "rows": [
104
+ { "Quarter": "Q1", "Revenue": "$1.2M", "Growth": "12%" },
105
+ { "Quarter": "Q2", "Revenue": "$1.5M", "Growth": "25%" }
106
+ ]
107
+ }
108
+ ```
109
+
110
+ ### Embedded Content
111
+
112
+ Embeds (iframes, videos, tweets) are replaced with links:
113
+
114
+ ```markdown
115
+ [Embedded content: https://youtube.com/watch?v=xyz123]
116
+ ```
117
+
118
+ ## Content Extraction
119
+
120
+ By default, `into-md` uses readability-style heuristics to:
121
+
122
+ - Extract main article/content area
123
+ - Remove navigation, headers, footers, sidebars
124
+ - Strip ads, cookie banners, and promotional content
125
+ - Filter out irrelevant widgets and scripts
126
+
127
+ Use `--exclude` to fine-tune extraction with additional CSS selectors:
128
+
129
+ ```bash
130
+ into-md https://example.com --exclude ".comments, .related-posts, #newsletter-signup"
131
+ ```
132
+
133
+ Use `--raw` to bypass extraction and convert the entire page.
134
+
135
+ ## Caching
136
+
137
+ Responses are cached locally by default to avoid redundant fetches. Cache location: `~/.cache/into-md/`
138
+
139
+ - Default TTL: 1 hour
140
+ - Use `--no-cache` to fetch fresh content
141
+ - Cache is keyed by URL
142
+
143
+ ## Size Warnings
144
+
145
+ If the output exceeds 100KB, a warning is printed to stderr:
146
+
147
+ ```
148
+ Warning: Output is 156KB. Large documents may exceed LLM context limits.
149
+ ```
150
+
151
+ ## Authentication
152
+
153
+ For pages requiring authentication, export cookies from your browser and pass them via `--cookies`:
154
+
155
+ ```bash
156
+ into-md https://private-docs.company.com/page --cookies ~/cookies.txt
157
+ ```
158
+
159
+ Cookie file format: Netscape/Mozilla cookie file format (compatible with browser extensions like EditThisCookie).
160
+
161
+ ## Error Handling
162
+
163
+ - **403/Blocked**: Clear error message suggesting `--user-agent` option
164
+ - **Timeouts**: Respects `--timeout` flag, defaults to 30 seconds
165
+ - **Encoding issues**: Auto-detects from headers/meta, converts to UTF-8; use `--encoding` to override
166
+
167
+ ## Technical Stack
168
+
169
+ - **Runtime**: Bun
170
+ - **Language**: TypeScript
171
+ - **HTML Parsing**: cheerio
172
+ - **Markdown Conversion**: turndown
173
+ - **Content Extraction**: @mozilla/readability
174
+ - **Headless Browser**: playwright (optional, for `--js` mode)
175
+ - **CLI Framework**: commander
176
+
177
+ ## Project Structure
178
+
179
+ ```
180
+ into-md/
181
+ ├── src/
182
+ │ ├── index.ts # CLI entry point
183
+ │ ├── fetcher.ts # URL fetching (static + headless)
184
+ │ ├── extractor.ts # Content extraction with readability
185
+ │ ├── converter.ts # HTML to markdown conversion
186
+ │ ├── tables.ts # Table to JSON conversion
187
+ │ ├── images.ts # Image context extraction
188
+ │ ├── metadata.ts # Frontmatter generation
189
+ │ └── cache.ts # Response caching
190
+ ├── package.json
191
+ ├── tsconfig.json
192
+ └── SPEC.md
193
+ ```
194
+
195
+ ## Future Considerations (Out of Scope for v1)
196
+
197
+ - Batch processing of multiple URLs
198
+ - Same-domain crawling with depth control
199
+ - Config file for persistent preferences
200
+ - Prebuilt binaries via GitHub releases
201
+ - Full authentication support (headers, basic auth)
package/package.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "name": "into-md",
3
+ "private": false,
4
+ "bin": {
5
+ "into-md": "dist/index.mjs"
6
+ },
7
+ "type": "module",
8
+ "module": "src/index.ts",
9
+ "scripts": {
10
+ "start": "bun run src/index.ts",
11
+ "build": "tsdown",
12
+ "build:watch": "tsdown --watch",
13
+ "test": "bun test",
14
+ "lint": "ultracite check",
15
+ "fix": "ultracite fix",
16
+ "fix:unsafe": "ultracite fix --unsafe",
17
+ "typecheck": "tsc --noEmit"
18
+ },
19
+ "dependencies": {
20
+ "@mozilla/readability": "^0.5.0",
21
+ "@types/turndown": "^5.0.6",
22
+ "cheerio": "^1.0.0",
23
+ "commander": "^12.1.0",
24
+ "jsdom": "^24.1.0",
25
+ "playwright": "^1.42.1",
26
+ "turndown": "^7.2.0"
27
+ },
28
+ "devDependencies": {
29
+ "@biomejs/biome": "2.3.11",
30
+ "@types/bun": "latest",
31
+ "oxlint": "^1.38.0",
32
+ "tsdown": "^0.19.0",
33
+ "ultracite": "7.0.11"
34
+ },
35
+ "peerDependencies": {
36
+ "typescript": "^5"
37
+ },
38
+ "version": "0.1.0"
39
+ }
package/src/cache.ts ADDED
@@ -0,0 +1,79 @@
1
+ import { createHash } from "node:crypto";
2
+ import { mkdir, readFile, stat, writeFile } from "node:fs/promises";
3
+ import { dirname, join } from "node:path";
4
+
5
+ export interface CacheOptions {
6
+ enabled: boolean;
7
+ ttlMs: number;
8
+ cacheDir?: string;
9
+ }
10
+
11
+ export interface CachedResponse {
12
+ url: string;
13
+ fetchedAt: number;
14
+ content: string;
15
+ }
16
+
17
+ const defaultCacheDir = join(
18
+ process.env.HOME ?? process.cwd(),
19
+ ".cache",
20
+ "into-md"
21
+ );
22
+
23
+ const DEFAULT_TTL_MS = 60 * 60 * 1000;
24
+
25
+ const buildCachePath = (url: string, cacheDir = defaultCacheDir): string => {
26
+ const hash = createHash("sha256").update(url).digest("hex");
27
+ return join(cacheDir, `${hash}.json`);
28
+ };
29
+
30
+ export async function readFromCache(
31
+ url: string,
32
+ options?: Partial<CacheOptions>
33
+ ): Promise<CachedResponse | null> {
34
+ const {
35
+ enabled = true,
36
+ ttlMs = DEFAULT_TTL_MS,
37
+ cacheDir = defaultCacheDir,
38
+ } = options ?? {};
39
+
40
+ if (!enabled) {
41
+ return null;
42
+ }
43
+
44
+ const target = buildCachePath(url, cacheDir);
45
+ try {
46
+ const [file, info] = await Promise.all([
47
+ readFile(target, "utf8"),
48
+ stat(target),
49
+ ]);
50
+ const payload = JSON.parse(file) as CachedResponse;
51
+ const isFresh = info.mtimeMs + ttlMs > Date.now();
52
+ if (!isFresh) {
53
+ return null;
54
+ }
55
+ if (payload.url !== url) {
56
+ return null;
57
+ }
58
+ return payload;
59
+ } catch {
60
+ return null;
61
+ }
62
+ }
63
+
64
+ export async function writeToCache(
65
+ url: string,
66
+ content: string,
67
+ options?: Partial<CacheOptions>
68
+ ): Promise<void> {
69
+ const { enabled = true, cacheDir = defaultCacheDir } = options ?? {};
70
+
71
+ if (!enabled) {
72
+ return;
73
+ }
74
+
75
+ const target = buildCachePath(url, cacheDir);
76
+ await mkdir(dirname(target), { recursive: true });
77
+ const payload: CachedResponse = { content, fetchedAt: Date.now(), url };
78
+ await writeFile(target, JSON.stringify(payload, null, 2), "utf8");
79
+ }
@@ -0,0 +1,96 @@
1
+ import { load } from "cheerio";
2
+ import TurndownService from "turndown";
3
+
4
+ import { getBodyHtml, toAbsoluteUrl } from "./utils";
5
+
6
+ export interface ConvertOptions {
7
+ baseUrl: string;
8
+ stripLinks?: boolean;
9
+ }
10
+
11
+ function prepareDom(html: string, baseUrl: string): string {
12
+ const $ = load(html);
13
+
14
+ for (const el of $("a[href]").toArray()) {
15
+ const $el = $(el);
16
+ const absolute = toAbsoluteUrl($el.attr("href"), baseUrl);
17
+ if (absolute) {
18
+ $el.attr("href", absolute);
19
+ }
20
+ }
21
+
22
+ for (const el of $("img[src]").toArray()) {
23
+ const $el = $(el);
24
+ const absolute = toAbsoluteUrl($el.attr("src"), baseUrl);
25
+ if (absolute) {
26
+ $el.attr("src", absolute);
27
+ }
28
+ }
29
+
30
+ $("script, style").remove();
31
+ return getBodyHtml($);
32
+ }
33
+
34
+ export function convertHtmlToMarkdown(
35
+ html: string,
36
+ options: ConvertOptions
37
+ ): string {
38
+ const prepared = prepareDom(html, options.baseUrl);
39
+ const turndown = new TurndownService({
40
+ bulletListMarker: "-",
41
+ codeBlockStyle: "fenced",
42
+ headingStyle: "atx",
43
+ });
44
+
45
+ turndown.addRule("stripLinks", {
46
+ filter: "a",
47
+ replacement: (content, node) => {
48
+ if (options.stripLinks) {
49
+ return content;
50
+ }
51
+ const href = (node as HTMLElement).getAttribute("href");
52
+ if (!href) {
53
+ return content;
54
+ }
55
+ return `[${content}](${href})`;
56
+ },
57
+ });
58
+
59
+ turndown.addRule("imagesWithCaption", {
60
+ filter: "img",
61
+ replacement: (_, node) => {
62
+ const element = node as HTMLElement;
63
+ const src = element.getAttribute("src") ?? "";
64
+ const alt = element.getAttribute("alt") ?? "";
65
+ const caption = element.getAttribute("data-into-md-caption");
66
+ const imageLine = `![${alt}](${src})`;
67
+ if (caption) {
68
+ return `${imageLine}\n*${caption}*`;
69
+ }
70
+ return imageLine;
71
+ },
72
+ });
73
+
74
+ turndown.addRule("tableJson", {
75
+ filter: (node) =>
76
+ node.nodeName === "PRE" &&
77
+ (node as HTMLElement).getAttribute("data-into-md-table") === "true",
78
+ replacement: (_content, node) => {
79
+ const text = (node as HTMLElement).textContent?.trim() ?? "";
80
+ return `\`\`\`json\n${text}\n\`\`\``;
81
+ },
82
+ });
83
+
84
+ turndown.addRule("embeds", {
85
+ filter: ["iframe", "embed", "video"],
86
+ replacement: (_, node) => {
87
+ const src = (node as HTMLElement).getAttribute("src") ?? "";
88
+ if (!src) {
89
+ return "";
90
+ }
91
+ return `[Embedded content: ${src}]`;
92
+ },
93
+ });
94
+
95
+ return turndown.turndown(prepared);
96
+ }
@@ -0,0 +1,85 @@
1
+ import { Readability } from "@mozilla/readability";
2
+ import { JSDOM } from "jsdom";
3
+
4
+ export interface ExtractOptions {
5
+ raw?: boolean;
6
+ excludeSelectors?: string[];
7
+ baseUrl: string;
8
+ }
9
+
10
+ export interface ExtractedContent {
11
+ html: string;
12
+ metadata: {
13
+ title?: string;
14
+ description?: string;
15
+ author?: string;
16
+ source: string;
17
+ };
18
+ }
19
+
20
+ function removeNodes(document: Document, selectors: string[]) {
21
+ for (const selector of selectors) {
22
+ for (const node of Array.from(document.querySelectorAll(selector))) {
23
+ node.remove();
24
+ }
25
+ }
26
+ }
27
+
28
+ function extractMetadata(document: Document, source: string) {
29
+ const title =
30
+ document.querySelector("title")?.textContent ??
31
+ document
32
+ .querySelector('meta[property="og:title"]')
33
+ ?.getAttribute("content") ??
34
+ undefined;
35
+
36
+ const description =
37
+ document
38
+ .querySelector('meta[name="description"]')
39
+ ?.getAttribute("content") ??
40
+ document
41
+ .querySelector('meta[property="og:description"]')
42
+ ?.getAttribute("content") ??
43
+ undefined;
44
+
45
+ const author =
46
+ document.querySelector('meta[name="author"]')?.getAttribute("content") ??
47
+ document
48
+ .querySelector('meta[property="article:author"]')
49
+ ?.getAttribute("content") ??
50
+ undefined;
51
+
52
+ return { author, description, source, title: title ?? undefined };
53
+ }
54
+
55
+ export function extractContent(
56
+ html: string,
57
+ { raw = false, excludeSelectors = [], baseUrl }: ExtractOptions
58
+ ): ExtractedContent {
59
+ const dom = new JSDOM(html, { url: baseUrl });
60
+ const { document } = dom.window;
61
+
62
+ if (excludeSelectors.length) {
63
+ removeNodes(document, excludeSelectors);
64
+ }
65
+
66
+ if (raw) {
67
+ const metadata = extractMetadata(document, baseUrl);
68
+ return { html: document.documentElement.outerHTML, metadata };
69
+ }
70
+
71
+ const clone = document.cloneNode(true) as Document;
72
+ const reader = new Readability(clone);
73
+ const article = reader.parse();
74
+
75
+ const contentHtml =
76
+ article?.content ?? document.querySelector("body")?.innerHTML ?? "";
77
+ const metadata = extractMetadata(document, baseUrl);
78
+ if (article?.title && !metadata.title) {
79
+ metadata.title = article.title;
80
+ }
81
+ if (article?.byline && !metadata.author) {
82
+ metadata.author = article.byline;
83
+ }
84
+ return { html: contentHtml, metadata };
85
+ }