web-to-markdown 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,222 @@
1
+ /** Options accepted by `convert`, which takes a `Partial` of this shape so any subset may be supplied. */ interface ConvertOptions {
2
+ /** If `true`, force headless browser rendering (for SPAs). */
3
+ readonly browser: boolean;
4
+ /** If `true`, convert the full HTML without content extraction. */
5
+ readonly raw: boolean;
6
+ /** If `true`, include YAML frontmatter with metadata in the output. */
7
+ readonly frontmatter: boolean;
8
+ /** If `true`, strip images from the output. */
9
+ readonly noImages: boolean;
10
+ /** Timeout in milliseconds for page loading. */
11
+ readonly timeout: number;
12
+ /** Output file path; when omitted (`undefined`) the output goes to stdout. */
13
+ readonly output?: string;
14
+ }
15
+ /** Result that `fetchPage` resolves to. */ interface FetchResult {
16
+ /** The raw HTML string of the page. */
17
+ readonly html: string;
18
+ /** The final URL after any redirects. */
19
+ readonly finalUrl: string;
20
+ }
21
+ /** Metadata about a page, carried by `ExtractResult`, `MdxExtractResult` and `ConvertResult`. Every field may be `null`. NOTE(review): the per-field descriptions below are inferred from the field names only — confirm against the implementation. */ interface PageMetadata {
22
+ /** Page title. */ readonly title: string | null;
23
+ /** Author byline. */ readonly byline: string | null;
24
+ /** Short excerpt or description of the page. */ readonly excerpt: string | null;
25
+ /** Name of the site the page belongs to. */ readonly siteName: string | null;
26
+ /** Publication time as a string; the format is not specified by this declaration. */ readonly publishedTime: string | null;
27
+ /** Language of the page content — presumably a language tag; confirm. */ readonly lang: string | null;
28
+ }
29
+ /** Successful result of `extractContent` (which returns `null` instead when extraction fails). */ interface ExtractResult {
30
+ /** Cleaned HTML content (article body). */
31
+ readonly content: string;
32
+ /** Metadata about the page. */
33
+ readonly metadata: PageMetadata;
34
+ }
35
+ /** Result that `convert` resolves to. */ interface ConvertResult {
36
+ /** The Markdown output string. */
37
+ readonly markdown: string;
38
+ /** Metadata extracted from the page. */
39
+ readonly metadata: PageMetadata;
40
+ /** Non-fatal warnings generated during conversion, exposed as a read-only list. */
41
+ readonly warnings: readonly string[];
42
+ }
43
+
44
+ /**
45
+ * Validate a URL for safety and resolve its hostname to an IP.
46
+ *
47
+ * Checks protocol, hostname blocklists, and private IP ranges.
48
+ * Performs a single DNS lookup and returns the resolved IP so that
49
+ * connections can be pinned to it, which prevents DNS rebinding.
50
+ *
51
+ * @param url - The URL string to validate.
52
+ * @returns A promise resolving to the IP address to pin connections to.
53
+ * @throws {ValidationError} If the URL is malformed or uses an unsupported protocol.
54
+ * @throws {SSRFError} If the hostname or resolved IP is private/internal.
55
+ * @throws {NetworkError} If DNS resolution fails.
56
+ */
57
+ declare function validateUrl(url: string): Promise<string>;
58
+ /**
59
+ * Fetch `url` and return its raw text without Content-Type validation.
60
+ * Uses the same SSRF protections and DNS pinning as fetchWithHttp. NOTE(review): `fetchWithHttp` is not declared in this file — presumably the internal HTTP fetcher; confirm. `timeout` is presumably in milliseconds, as for `fetchPage` — confirm.
61
+ * Resolves to the response `body` and its `contentType`, or to `null` on any failure (network error, non-2xx status, etc.).
62
+ */
63
+ declare function fetchRawText(url: string, timeout: number): Promise<{
64
+ body: string;
65
+ contentType: string;
66
+ } | null>;
67
+ /**
68
+ * Fetch a page using either plain HTTP or a headless browser.
69
+ *
70
+ * Validates the URL for SSRF and resolves DNS once.
71
+ * The resolved IP is pinned to the connection to prevent DNS rebinding.
72
+ *
73
+ * @param url - The URL to fetch.
74
+ * @param options - Fetch options; both fields are required.
75
+ * @param options.browser - If `true`, use Playwright headless browser for SPAs.
76
+ * @param options.timeout - Request timeout in milliseconds (clamped to 1–300 s).
77
+ * @returns A promise resolving to the raw HTML and the final URL after redirects.
78
+ * @throws {ValidationError} If the URL is invalid.
79
+ * @throws {SSRFError} If the URL targets a private/internal host.
80
+ * @throws {NetworkError} On timeout, DNS failure, or HTTP error.
81
+ * @throws {ContentError} If the response is too large or not HTML.
82
+ */
83
+ declare function fetchPage(url: string, options: Readonly<{
84
+ browser: boolean;
85
+ timeout: number;
86
+ }>): Promise<FetchResult>;
87
+
88
+ /**
89
+ * Extract the main article content from raw HTML using
90
+ * Mozilla Readability + linkedom DOM parsing.
91
+ *
92
+ * @param html - The raw HTML string to process.
93
+ * @param url - The page URL, used for resolving relative links.
94
+ * @returns The extracted content and metadata, or `null` if extraction fails. The result is returned synchronously, not as a promise.
95
+ */
96
+ declare function extractContent(html: string, url: string): ExtractResult | null;
97
+
98
+ /** Result of MDX extraction, returned by `extractMdx` (when it finds content) and by `processRawMdx`. */ interface MdxExtractResult {
99
+ /** Clean markdown content (MDX components stripped). */
100
+ readonly markdown: string;
101
+ /** Metadata parsed from frontmatter. */
102
+ readonly metadata: PageMetadata;
103
+ }
104
+ /**
105
+ * Attempt to extract raw MDX content from Next.js RSC (React Server Components)
106
+ * payloads embedded in the HTML. Many Next.js documentation sites ship the full
107
+ * markdown source inside `self.__next_f.push([1,"..."])` script tags. This
108
+ * content includes everything — even collapsed accordion and tab content that
109
+ * never renders to the DOM.
110
+ *
111
+ * @param html - The raw HTML string of the page.
112
+ * @param url - The page URL, used for resolving relative links.
113
+ * @returns Extracted markdown and metadata, or `null` if no RSC MDX is found. The result is returned synchronously, not as a promise.
114
+ */
115
+ declare function extractMdx(html: string, url: string): MdxExtractResult | null;
116
+ /**
117
+ * Process the raw MDX/markdown text in `mdx`: parse frontmatter, strip JSX components,
118
+ * resolve relative URLs (presumably against `url` — confirm), and clean up whitespace.
119
+ *
120
+ * Used for processing .md endpoint responses and raw MDX from RSC chunks. Always returns a result: unlike `extractMdx`, the return type has no `null` case.
121
+ */
122
+ declare function processRawMdx(mdx: string, url: string): MdxExtractResult;
123
+
124
+ /** Options for `htmlToMarkdown`; both members are optional. NOTE(review): unlike the other interfaces in this file, the members here are not `readonly`. */ interface ConverterOptions {
125
+ /** Base URL for resolving relative URLs to absolute. */
126
+ baseUrl?: string;
127
+ /** If true, strip all images from output. */
128
+ stripImages?: boolean;
129
+ }
130
+ /**
131
+ * Convert an HTML string to Markdown.
132
+ *
133
+ * Uses Turndown with GFM extensions. Resolves relative URLs to absolute
134
+ * when `baseUrl` is provided. Filters dangerous protocols and optionally
135
+ * strips images.
136
+ *
137
+ * @param html - The HTML string to convert.
138
+ * @param options - Conversion options.
139
+ * @param options.baseUrl - Base URL for resolving relative links.
140
+ * @param options.stripImages - If `true`, remove all images from output.
141
+ * @returns A Markdown string with trailing newline.
142
+ * @throws {ContentError} If the HTML is too deeply nested to process.
143
+ */
144
+ declare function htmlToMarkdown(html: string, options?: ConverterOptions): string;
145
+
146
+ /**
147
+ * Custom error classes for web-to-markdown.
148
+ *
149
+ * Typed errors enable consumers to handle specific failure modes
150
+ * programmatically (e.g. retry on timeout, show SSRF to user, etc.)
151
+ * instead of parsing error message strings.
152
+ *
153
+ * All errors support the standard `cause` property for error chaining
154
+ * (ES2022 Error Cause), preserving the original error context.
155
+ */
156
+ /** Options accepted by all web-to-markdown error constructors. NOTE(review): this shares its name with the `ErrorOptions` type in TypeScript's ES2022 lib; within this module the local declaration is the one referenced. */
157
+ interface ErrorOptions {
158
+ /** The original error or value that led to this error, kept for error chaining. */ cause?: unknown;
159
+ }
160
+ /**
161
+ * Base class for all web-to-markdown errors; the other error classes declared in this file extend it.
162
+ * Consumers can catch this to handle any library error.
163
+ */
164
+ declare class MarkitdownError extends Error {
165
+ /** Creates the error from a `message` and optional `options` carrying a `cause`. */ constructor(message: string, options?: ErrorOptions);
166
+ }
167
+ /**
168
+ * Thrown when a URL is invalid or uses an unsupported protocol. `convert` also documents throwing it when its options are invalid.
169
+ */
170
+ declare class ValidationError extends MarkitdownError {
171
+ /** Creates the error from a `message` and optional `options` carrying a `cause`. */ constructor(message: string, options?: ErrorOptions);
172
+ }
173
+ /**
174
+ * Thrown when a request is blocked by SSRF protection
175
+ * (private IPs, internal hostnames, DNS rebinding, etc.).
176
+ */
177
+ declare class SSRFError extends MarkitdownError {
178
+ /** Creates the error from a `message` and optional `options` carrying a `cause`. */ constructor(message: string, options?: ErrorOptions);
179
+ }
180
+ /**
181
+ * Thrown when a network request fails (timeout, DNS failure, HTTP error, etc.).
182
+ */
183
+ declare class NetworkError extends MarkitdownError {
184
+ /** Status code associated with the failure, if any — presumably the HTTP status for HTTP errors; confirm. */ readonly statusCode?: number;
185
+ /** Creates the error from a `message`, an optional `statusCode`, and optional `options` carrying a `cause`. Note that `statusCode` comes before `options`, unlike the two-argument constructors of the sibling error classes. */ constructor(message: string, statusCode?: number, options?: ErrorOptions);
186
+ }
187
+ /**
188
+ * Thrown when content cannot be processed (too large, too deeply nested, etc.).
189
+ */
190
+ declare class ContentError extends MarkitdownError {
191
+ /** Creates the error from a `message` and optional `options` carrying a `cause`. */ constructor(message: string, options?: ErrorOptions);
192
+ }
193
+
194
+ /**
195
+ * Convert a URL to Markdown.
196
+ *
197
+ * Primary API for both CLI and programmatic use.
198
+ * Fetches the page, extracts main content via Readability,
199
+ * converts to Markdown, and optionally prepends YAML frontmatter.
200
+ *
201
+ * @param url - The HTTP/HTTPS URL to convert.
202
+ * @param options - Overrides for the default conversion options; any subset of `ConvertOptions` may be given.
203
+ * @returns A promise resolving to the Markdown string, page metadata, and any non-fatal warnings.
204
+ * @throws {ValidationError} If the URL or options are invalid.
205
+ * @throws {SSRFError} If the URL targets a private/internal host.
206
+ * @throws {NetworkError} On timeout, DNS failure, or HTTP error.
207
+ * @throws {ContentError} If the response is too large or malformed.
208
+ *
209
+ * @example
210
+ * ```ts
211
+ * import { convert, NetworkError } from "web-to-markdown";
212
+ *
213
+ * try {
214
+ * const { markdown } = await convert("https://example.com");
215
+ * } catch (err) {
216
+ * if (err instanceof NetworkError) console.error("Network issue:", err.message);
217
+ * }
218
+ * ```
219
+ */
220
+ declare function convert(url: string, options?: Partial<ConvertOptions>): Promise<ConvertResult>;
221
+
222
+ export { ContentError, type ConvertOptions, type ConvertResult, type ConverterOptions, type ExtractResult, type FetchResult, MarkitdownError, type MdxExtractResult, NetworkError, type PageMetadata, SSRFError, ValidationError, convert, extractContent, extractMdx, fetchPage, fetchRawText, htmlToMarkdown, processRawMdx, validateUrl }; // ConverterOptions, ExtractResult, FetchResult and MdxExtractResult are exported as types because they appear in the signatures of exported functions and consumers need to be able to name them.