into-md 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/CLAUDE.md +251 -0
- package/.claude/settings.json +15 -0
- package/.claude/settings.local.json +9 -0
- package/.cursor/hooks.json +10 -0
- package/.vscode/settings.json +53 -0
- package/AGENTS.md +284 -0
- package/CLAUDE.md +111 -0
- package/GEMINI.md +123 -0
- package/README.md +133 -0
- package/biome.jsonc +4 -0
- package/bun.lock +413 -0
- package/dist/index.d.mts +3 -0
- package/dist/index.mjs +446 -0
- package/dist/index.mjs.map +1 -0
- package/docs/SPEC.md +201 -0
- package/package.json +39 -0
- package/src/cache.ts +79 -0
- package/src/converter.ts +96 -0
- package/src/extractor.ts +85 -0
- package/src/fetcher.ts +236 -0
- package/src/images.ts +27 -0
- package/src/index.ts +143 -0
- package/src/metadata.ts +30 -0
- package/src/tables.ts +80 -0
- package/src/types/jsdom.d.ts +10 -0
- package/src/utils.ts +28 -0
- package/tsconfig.json +29 -0
- package/tsdown.config.ts +14 -0
package/docs/SPEC.md
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# into-md
|
|
2
|
+
|
|
3
|
+
A CLI tool that fetches web pages and converts them to clean markdown, optimized for providing context to LLMs.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
`into-md` fetches a single URL, extracts the main content using readability heuristics, and outputs clean markdown suitable for LLM consumption. It preserves images with context, converts tables to structured JSON, and includes standard metadata.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install -g into-md
|
|
13
|
+
# or
|
|
14
|
+
bunx into-md <url>
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
into-md <url> [options]
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Examples
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Output to stdout
|
|
27
|
+
into-md https://example.com/article
|
|
28
|
+
|
|
29
|
+
# Save to file
|
|
30
|
+
into-md https://example.com/article -o article.md
|
|
31
|
+
|
|
32
|
+
# Use headless browser for JS-rendered content
|
|
33
|
+
into-md https://spa-site.com/page --js
|
|
34
|
+
|
|
35
|
+
# Skip content extraction, convert full page
|
|
36
|
+
into-md https://example.com --raw
|
|
37
|
+
|
|
38
|
+
# With authentication cookies
|
|
39
|
+
into-md https://private-site.com/page --cookies cookies.txt
|
|
40
|
+
|
|
41
|
+
# Verbose output
|
|
42
|
+
into-md https://example.com/article -v
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Options
|
|
46
|
+
|
|
47
|
+
| Flag | Description | Default |
|
|
48
|
+
| ----------------------- | --------------------------------------------------------- | --------------- |
|
|
49
|
+
| `-o, --output <file>` | Write output to file instead of stdout | stdout |
|
|
50
|
+
| `--js` | Use headless browser (Playwright) for JS-rendered content | disabled |
|
|
51
|
+
| `--raw` | Skip content extraction, convert entire HTML | disabled |
|
|
52
|
+
| `--cookies <file>` | Path to cookies file for authenticated requests | none |
|
|
53
|
+
| `--user-agent <string>` | Custom User-Agent header | browser-like UA |
|
|
54
|
+
| `--encoding <encoding>` | Force character encoding (auto-detected by default) | auto |
|
|
55
|
+
| `--strip-links` | Remove hyperlinks, keep only anchor text | disabled |
|
|
56
|
+
| `--exclude <selectors>` | CSS selectors to exclude (comma-separated) | none |
|
|
57
|
+
| `--timeout <ms>` | Request timeout in milliseconds | 30000 |
|
|
58
|
+
| `--no-cache` | Bypass response cache | cache enabled |
|
|
59
|
+
| `-v, --verbose` | Show detailed progress information | minimal |
|
|
60
|
+
| `-h, --help` | Show help | - |
|
|
61
|
+
| `--version` | Show version | - |
|
|
62
|
+
|
|
63
|
+
## Output Format
|
|
64
|
+
|
|
65
|
+
### Frontmatter
|
|
66
|
+
|
|
67
|
+
Standard metadata is included as YAML frontmatter when available:
|
|
68
|
+
|
|
69
|
+
```yaml
|
|
70
|
+
---
|
|
71
|
+
title: "Article Title"
|
|
72
|
+
description: "Meta description from the page"
|
|
73
|
+
author: "Author Name"
|
|
74
|
+
date: "2024-01-15"
|
|
75
|
+
source: "https://example.com/article"
|
|
76
|
+
---
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Content Structure
|
|
80
|
+
|
|
81
|
+
- **Headings**: Preserved as-is from source (original hierarchy maintained)
|
|
82
|
+
- **Text formatting**: Semantic formatting preserved (bold, italic, strikethrough); decorative formatting (colors, underlines) stripped
|
|
83
|
+
- **Links**: Preserved as markdown links by default; all relative URLs converted to absolute
|
|
84
|
+
- **Code blocks**: Language auto-detected and tagged for syntax highlighting
|
|
85
|
+
|
|
86
|
+
### Images
|
|
87
|
+
|
|
88
|
+
Images include alt text, URL, and surrounding context:
|
|
89
|
+
|
|
90
|
+
```markdown
|
|
91
|
+

|
|
92
|
+
_Figure 1: The system uses a microservices architecture with three main components._
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Tables
|
|
96
|
+
|
|
97
|
+
Tables are converted to fenced JSON blocks for reliable LLM parsing:
|
|
98
|
+
|
|
99
|
+
```json
|
|
100
|
+
{
|
|
101
|
+
"caption": "Quarterly Revenue",
|
|
102
|
+
"headers": ["Quarter", "Revenue", "Growth"],
|
|
103
|
+
"rows": [
|
|
104
|
+
{ "Quarter": "Q1", "Revenue": "$1.2M", "Growth": "12%" },
|
|
105
|
+
{ "Quarter": "Q2", "Revenue": "$1.5M", "Growth": "25%" }
|
|
106
|
+
]
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Embedded Content
|
|
111
|
+
|
|
112
|
+
Embeds (iframes, videos, tweets) are replaced with links:
|
|
113
|
+
|
|
114
|
+
```markdown
|
|
115
|
+
[Embedded video: https://youtube.com/watch?v=xyz123]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Content Extraction
|
|
119
|
+
|
|
120
|
+
By default, `into-md` uses readability-style heuristics to:
|
|
121
|
+
|
|
122
|
+
- Extract main article/content area
|
|
123
|
+
- Remove navigation, headers, footers, sidebars
|
|
124
|
+
- Strip ads, cookie banners, and promotional content
|
|
125
|
+
- Filter out irrelevant widgets and scripts
|
|
126
|
+
|
|
127
|
+
Use `--exclude` to fine-tune extraction with additional CSS selectors:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
into-md https://example.com --exclude ".comments, .related-posts, #newsletter-signup"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Use `--raw` to bypass extraction and convert the entire page.
|
|
134
|
+
|
|
135
|
+
## Caching
|
|
136
|
+
|
|
137
|
+
Responses are cached locally by default to avoid redundant fetches. Cache location: `~/.cache/into-md/`
|
|
138
|
+
|
|
139
|
+
- Default TTL: 1 hour
|
|
140
|
+
- Use `--no-cache` to fetch fresh content
|
|
141
|
+
- Cache is keyed by URL
|
|
142
|
+
|
|
143
|
+
## Size Warnings
|
|
144
|
+
|
|
145
|
+
If the output exceeds 100KB, a warning is printed to stderr:
|
|
146
|
+
|
|
147
|
+
```
|
|
148
|
+
Warning: Output is 156KB. Large documents may exceed LLM context limits.
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Authentication
|
|
152
|
+
|
|
153
|
+
For pages requiring authentication, export cookies from your browser and pass them via `--cookies`:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
into-md https://private-docs.company.com/page --cookies ~/cookies.txt
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Cookie file format: Netscape/Mozilla cookie file format (compatible with browser extensions like EditThisCookie).
|
|
160
|
+
|
|
161
|
+
## Error Handling
|
|
162
|
+
|
|
163
|
+
- **403/Blocked**: Clear error message suggesting `--user-agent` option
|
|
164
|
+
- **Timeouts**: Respects `--timeout` flag, defaults to 30 seconds
|
|
165
|
+
- **Encoding issues**: Auto-detects from headers/meta, converts to UTF-8; use `--encoding` to override
|
|
166
|
+
|
|
167
|
+
## Technical Stack
|
|
168
|
+
|
|
169
|
+
- **Runtime**: Bun
|
|
170
|
+
- **Language**: TypeScript
|
|
171
|
+
- **HTML Parsing**: cheerio
|
|
172
|
+
- **Markdown Conversion**: turndown
|
|
173
|
+
- **Content Extraction**: @mozilla/readability
|
|
174
|
+
- **Headless Browser**: playwright (optional, for `--js` mode)
|
|
175
|
+
- **CLI Framework**: commander
|
|
176
|
+
|
|
177
|
+
## Project Structure
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
into-md/
|
|
181
|
+
├── src/
|
|
182
|
+
│ ├── index.ts # CLI entry point
|
|
183
|
+
│ ├── fetcher.ts # URL fetching (static + headless)
|
|
184
|
+
│ ├── extractor.ts # Content extraction with readability
|
|
185
|
+
│ ├── converter.ts # HTML to markdown conversion
|
|
186
|
+
│ ├── tables.ts # Table to JSON conversion
|
|
187
|
+
│ ├── images.ts # Image context extraction
|
|
188
|
+
│ ├── metadata.ts # Frontmatter generation
|
|
189
|
+
│ └── cache.ts # Response caching
|
|
190
|
+
├── package.json
|
|
191
|
+
├── tsconfig.json
|
|
192
|
+
└── SPEC.md
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Future Considerations (Out of Scope for v1)
|
|
196
|
+
|
|
197
|
+
- Batch processing of multiple URLs
|
|
198
|
+
- Same-domain crawling with depth control
|
|
199
|
+
- Config file for persistent preferences
|
|
200
|
+
- Prebuilt binaries via GitHub releases
|
|
201
|
+
- Full authentication support (headers, basic auth)
|
package/package.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "into-md",
|
|
3
|
+
"private": false,
|
|
4
|
+
"bin": {
|
|
5
|
+
"into-md": "dist/index.mjs"
|
|
6
|
+
},
|
|
7
|
+
"type": "module",
|
|
8
|
+
"module": "src/index.ts",
|
|
9
|
+
"scripts": {
|
|
10
|
+
"start": "bun run src/index.ts",
|
|
11
|
+
"build": "tsdown",
|
|
12
|
+
"build:watch": "tsdown --watch",
|
|
13
|
+
"test": "bun test",
|
|
14
|
+
"lint": "ultracite check",
|
|
15
|
+
"fix": "ultracite fix",
|
|
16
|
+
"fix:unsafe": "ultracite fix --unsafe",
|
|
17
|
+
"typecheck": "tsc --noEmit"
|
|
18
|
+
},
|
|
19
|
+
"dependencies": {
|
|
20
|
+
"@mozilla/readability": "^0.5.0",
|
|
21
|
+
"@types/turndown": "^5.0.6",
|
|
22
|
+
"cheerio": "^1.0.0",
|
|
23
|
+
"commander": "^12.1.0",
|
|
24
|
+
"jsdom": "^24.1.0",
|
|
25
|
+
"playwright": "^1.42.1",
|
|
26
|
+
"turndown": "^7.2.0"
|
|
27
|
+
},
|
|
28
|
+
"devDependencies": {
|
|
29
|
+
"@biomejs/biome": "2.3.11",
|
|
30
|
+
"@types/bun": "latest",
|
|
31
|
+
"oxlint": "^1.38.0",
|
|
32
|
+
"tsdown": "^0.19.0",
|
|
33
|
+
"ultracite": "7.0.11"
|
|
34
|
+
},
|
|
35
|
+
"peerDependencies": {
|
|
36
|
+
"typescript": "^5"
|
|
37
|
+
},
|
|
38
|
+
"version": "0.1.0"
|
|
39
|
+
}
|
package/src/cache.ts
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { mkdir, readFile, stat, writeFile } from "node:fs/promises";
|
|
3
|
+
import { dirname, join } from "node:path";
|
|
4
|
+
|
|
5
|
+
/** Settings accepted (partially) by the cache read and write helpers. */
export interface CacheOptions {
  /** Directory holding the cache files; the module default is used when omitted. */
  cacheDir?: string;
  /** When `false`, reads return `null` and writes are skipped. */
  enabled: boolean;
  /** Maximum age of a cache file, in milliseconds, before it counts as stale. */
  ttlMs: number;
}
|
|
10
|
+
|
|
11
|
+
/** Shape of one cache entry as serialized to disk. */
export interface CachedResponse {
  /** The cached response body. */
  content: string;
  /** `Date.now()` timestamp taken when the entry was written. */
  fetchedAt: number;
  /** URL the entry was stored for; compared against the requested URL on read. */
  url: string;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Default cache directory: `<home>/.cache/into-md`.
 *
 * `HOME` is preferred. `USERPROFILE` covers Windows shells where `HOME` is
 * not set, and the current working directory is the last resort. `||` is
 * used instead of `??` so an empty variable is treated as unset.
 */
const defaultCacheDir = join(
  process.env.HOME || process.env.USERPROFILE || process.cwd(),
  ".cache",
  "into-md"
);

/** Default freshness window for cache entries: one hour, in milliseconds. */
const DEFAULT_TTL_MS = 60 * 60 * 1000;
|
|
24
|
+
|
|
25
|
+
/**
 * Maps a URL to the path of its cache file.
 *
 * The file name is the hex-encoded SHA-256 digest of the URL plus a `.json`
 * extension, placed inside `cacheDir`.
 */
const buildCachePath = (url: string, cacheDir = defaultCacheDir): string => {
  const hasher = createHash("sha256");
  hasher.update(url);
  const fileName = `${hasher.digest("hex")}.json`;
  return join(cacheDir, fileName);
};
|
|
29
|
+
|
|
30
|
+
/**
 * Looks up a previously cached response for `url`.
 *
 * @param url - URL whose cached response is wanted; also the cache key.
 * @param options - Optional overrides. `enabled` defaults to `true`, `ttlMs`
 *   to `DEFAULT_TTL_MS` and `cacheDir` to `defaultCacheDir`.
 * @returns The cached entry, or `null` when caching is disabled or the entry
 *   is missing, unreadable, malformed, stale, or stored for another URL.
 */
export async function readFromCache(
  url: string,
  options?: Partial<CacheOptions>
): Promise<CachedResponse | null> {
  const {
    enabled = true,
    ttlMs = DEFAULT_TTL_MS,
    cacheDir = defaultCacheDir,
  } = options ?? {};

  if (!enabled) {
    return null;
  }

  const target = buildCachePath(url, cacheDir);
  try {
    const [file, info] = await Promise.all([
      readFile(target, "utf8"),
      stat(target),
    ]);

    // Freshness is judged by the file's modification time, not `fetchedAt`.
    const isFresh = info.mtimeMs + ttlMs > Date.now();
    if (!isFresh) {
      return null;
    }

    // The file is external input: validate its shape instead of trusting a
    // cast, so a well-formed but wrong-shaped file is treated as a miss.
    const parsed: unknown = JSON.parse(file);
    if (typeof parsed !== "object" || parsed === null) {
      return null;
    }
    const { content, fetchedAt, url: cachedUrl } = parsed as Partial<
      Record<keyof CachedResponse, unknown>
    >;
    if (
      cachedUrl !== url ||
      typeof content !== "string" ||
      typeof fetchedAt !== "number"
    ) {
      return null;
    }
    return { content, fetchedAt, url };
  } catch {
    // Missing file, unreadable file or invalid JSON: all count as a miss.
    return null;
  }
}
|
|
63
|
+
|
|
64
|
+
/**
 * Stores `content` as the cached response for `url`.
 *
 * Does nothing when `options.enabled` is `false`. Otherwise the cache
 * directory is created if needed and the entry is written as indented JSON.
 *
 * @param url - URL the content was fetched from; also the cache key.
 * @param content - Response body to store.
 * @param options - Optional overrides; `enabled` defaults to `true` and
 *   `cacheDir` to `defaultCacheDir`.
 */
export async function writeToCache(
  url: string,
  content: string,
  options?: Partial<CacheOptions>
): Promise<void> {
  const settings: Partial<CacheOptions> = options ?? {};
  const enabled = settings.enabled === undefined ? true : settings.enabled;
  const cacheDir =
    settings.cacheDir === undefined ? defaultCacheDir : settings.cacheDir;

  if (enabled) {
    const target = buildCachePath(url, cacheDir);
    await mkdir(dirname(target), { recursive: true });

    const entry: CachedResponse = { content, fetchedAt: Date.now(), url };
    const serialized = JSON.stringify(entry, null, 2);
    await writeFile(target, serialized, "utf8");
  }
}
|
package/src/converter.ts
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
import TurndownService from "turndown";
|
|
3
|
+
|
|
4
|
+
import { getBodyHtml, toAbsoluteUrl } from "./utils";
|
|
5
|
+
|
|
6
|
+
/** Options controlling HTML-to-markdown conversion. */
export interface ConvertOptions {
  /** When `true`, anchors are rendered as their text only, without a URL. */
  stripLinks?: boolean;
  /** URL that link and image addresses are resolved against. */
  baseUrl: string;
}
|
|
10
|
+
|
|
11
|
+
/**
 * Selector/attribute pairs whose URL attribute is rewritten to an absolute
 * URL. Embeds are included because the markdown converter prints their `src`
 * verbatim, and a relative address there is useless outside the page.
 */
const ABSOLUTE_URL_TARGETS = [
  ["a[href]", "href"],
  ["img[src]", "src"],
  ["iframe[src], embed[src], video[src]", "src"],
] as const;

/**
 * Normalizes an HTML string before it is handed to the markdown converter.
 *
 * Rewrites link, image and embed URLs against `baseUrl`, drops `<script>`
 * and `<style>` elements, and returns the result of `getBodyHtml`.
 *
 * @param html - HTML to prepare.
 * @param baseUrl - URL that relative addresses are resolved against.
 * @returns The prepared markup as produced by `getBodyHtml`.
 */
function prepareDom(html: string, baseUrl: string): string {
  const $ = load(html);

  for (const [selector, attribute] of ABSOLUTE_URL_TARGETS) {
    for (const el of $(selector).toArray()) {
      const $el = $(el);
      // NOTE(review): `toAbsoluteUrl` presumably returns a falsy value when
      // the URL cannot be resolved; the original attribute is then kept.
      const absolute = toAbsoluteUrl($el.attr(attribute), baseUrl);
      if (absolute) {
        $el.attr(attribute, absolute);
      }
    }
  }

  $("script, style").remove();
  return getBodyHtml($);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Converts an HTML string to markdown.
 *
 * The HTML is first normalized by `prepareDom`, then converted by Turndown
 * (ATX headings, fenced code blocks, `-` bullets) with four custom rules:
 *
 * - `stripLinks`: renders anchors as `[text](href)`, or as plain text when
 *   `options.stripLinks` is set or the anchor has no `href`.
 * - `imagesWithCaption`: renders images as `![alt](src)`, followed by an
 *   italic caption line when `data-into-md-caption` is present.
 * - `tableJson`: renders `<pre data-into-md-table="true">` as a fenced
 *   `json` block.
 * - `embeds`: renders `iframe`/`embed`/`video` as `[Embedded content: src]`.
 *
 * The `data-into-md-*` attributes are presumably set by an earlier
 * processing step that is not part of this function.
 *
 * @param html - HTML to convert.
 * @param options - Base URL for resolving addresses and the link policy.
 * @returns The markdown produced by Turndown.
 */
export function convertHtmlToMarkdown(
  html: string,
  options: ConvertOptions
): string {
  const prepared = prepareDom(html, options.baseUrl);
  const turndown = new TurndownService({
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    headingStyle: "atx",
  });

  turndown.addRule("stripLinks", {
    filter: "a",
    replacement: (content, node) => {
      if (options.stripLinks) {
        return content;
      }
      const href = (node as HTMLElement).getAttribute("href");
      if (!href) {
        return content;
      }
      return `[${content}](${href})`;
    },
  });

  turndown.addRule("imagesWithCaption", {
    filter: "img",
    replacement: (_, node) => {
      const element = node as HTMLElement;
      const src = element.getAttribute("src") ?? "";
      // An image without a source would render as `![alt]()`, a dead
      // reference; drop it, matching how the embeds rule treats a missing src.
      if (!src) {
        return "";
      }
      const alt = element.getAttribute("alt") ?? "";
      const caption = element.getAttribute("data-into-md-caption");
      const imageLine = `![${alt}](${src})`;
      if (caption) {
        return `${imageLine}\n*${caption}*`;
      }
      return imageLine;
    },
  });

  turndown.addRule("tableJson", {
    filter: (node) =>
      node.nodeName === "PRE" &&
      (node as HTMLElement).getAttribute("data-into-md-table") === "true",
    replacement: (_content, node) => {
      const text = (node as HTMLElement).textContent?.trim() ?? "";
      // Surround the fence with blank lines so it can never fuse with
      // adjacent inline text, which would break the code fence. Turndown
      // collapses redundant newlines between blocks.
      return `\n\n\`\`\`json\n${text}\n\`\`\`\n\n`;
    },
  });

  turndown.addRule("embeds", {
    filter: ["iframe", "embed", "video"],
    replacement: (_, node) => {
      const src = (node as HTMLElement).getAttribute("src") ?? "";
      if (!src) {
        return "";
      }
      return `[Embedded content: ${src}]`;
    },
  });

  return turndown.turndown(prepared);
}
|
package/src/extractor.ts
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { Readability } from "@mozilla/readability";
|
|
2
|
+
import { JSDOM } from "jsdom";
|
|
3
|
+
|
|
4
|
+
/** Options accepted by `extractContent`. */
export interface ExtractOptions {
  /** URL of the page; used as the document URL and recorded as the source. */
  baseUrl: string;
  /** CSS selectors whose matching nodes are removed before extraction. */
  excludeSelectors?: string[];
  /** When `true`, the whole document is returned instead of the article. */
  raw?: boolean;
}
|
|
9
|
+
|
|
10
|
+
/** Result of content extraction: the selected HTML plus page metadata. */
export interface ExtractedContent {
  /** Page metadata gathered from the document. */
  metadata: {
    /** URL the page was loaded from. */
    source: string;
    /** Page title, when one was found. */
    title?: string;
    /** Page description, when one was found. */
    description?: string;
    /** Author name, when one was found. */
    author?: string;
  };
  /** HTML of the extracted content. */
  html: string;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Removes every node matching any of the given CSS selectors.
 *
 * Blank selectors (such as those left by a trailing comma in a
 * comma-separated list) are skipped, because `querySelectorAll` rejects an
 * empty selector. An error thrown by `querySelectorAll` for a malformed
 * selector still propagates to the caller.
 *
 * @param document - Document to mutate in place.
 * @param selectors - CSS selectors identifying the nodes to remove.
 */
function removeNodes(document: Document, selectors: string[]) {
  for (const rawSelector of selectors) {
    const selector = rawSelector.trim();
    if (!selector) {
      continue;
    }
    // Snapshot the matches first so removal cannot disturb the iteration.
    for (const node of Array.from(document.querySelectorAll(selector))) {
      node.remove();
    }
  }
}
|
|
27
|
+
|
|
28
|
+
/** Returns the first candidate containing non-whitespace text, trimmed. */
function firstNonBlank(
  ...candidates: (string | null | undefined)[]
): string | undefined {
  for (const candidate of candidates) {
    const trimmed = candidate?.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return undefined;
}

/**
 * Collects title, description and author from a document's `<title>` and
 * `<meta>` elements.
 *
 * Each field tries its primary source first and then its Open Graph /
 * article fallback. Empty or whitespace-only values count as missing so the
 * fallback is still consulted, and returned values are trimmed.
 *
 * @param document - Document to read from.
 * @param source - URL of the page; returned unchanged as `source`.
 * @returns The metadata; fields that were not found are `undefined`.
 */
function extractMetadata(document: Document, source: string) {
  const metaContent = (selector: string) =>
    document.querySelector(selector)?.getAttribute("content");

  const title = firstNonBlank(
    document.querySelector("title")?.textContent,
    metaContent('meta[property="og:title"]')
  );

  const description = firstNonBlank(
    metaContent('meta[name="description"]'),
    metaContent('meta[property="og:description"]')
  );

  const author = firstNonBlank(
    metaContent('meta[name="author"]'),
    metaContent('meta[property="article:author"]')
  );

  return { author, description, source, title };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Extracts the main content and metadata from an HTML page.
 *
 * Nodes matching `excludeSelectors` are removed first. In raw mode the whole
 * remaining document is returned. Otherwise Readability runs on a clone of
 * the document, and the body's inner HTML is used if it yields no article.
 * Readability's title and byline fill in metadata the document lacked.
 *
 * @param html - HTML source of the page.
 * @param options - Extraction options; `raw` defaults to `false` and
 *   `excludeSelectors` to an empty list.
 * @returns The selected HTML together with the page metadata.
 */
export function extractContent(
  html: string,
  options: ExtractOptions
): ExtractedContent {
  const { baseUrl, excludeSelectors = [], raw = false } = options;
  const { document } = new JSDOM(html, { url: baseUrl }).window;

  if (excludeSelectors.length > 0) {
    removeNodes(document, excludeSelectors);
  }

  const metadata = extractMetadata(document, baseUrl);

  if (raw) {
    return { html: document.documentElement.outerHTML, metadata };
  }

  // Readability runs on a clone so the original document stays intact for
  // the body fallback below.
  const article = new Readability(document.cloneNode(true) as Document).parse();

  if (article?.title && !metadata.title) {
    metadata.title = article.title;
  }
  if (article?.byline && !metadata.author) {
    metadata.author = article.byline;
  }

  const contentHtml =
    article?.content ?? document.querySelector("body")?.innerHTML ?? "";
  return { html: contentHtml, metadata };
}
|