markit-ai 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/markit.d.ts +15 -0
- package/dist/markit.js +90 -7
- package/package.json +1 -1
package/dist/markit.d.ts
CHANGED
|
@@ -12,8 +12,23 @@ export declare class Markit {
|
|
|
12
12
|
* Convert a URL to markdown.
|
|
13
13
|
*/
|
|
14
14
|
convertUrl(url: string): Promise<ConversionResult>;
|
|
15
|
+
/**
|
|
16
|
+
* Inspect an HTML response for a discoverable markdown source URL.
|
|
17
|
+
* If found, fetch and convert the raw markdown instead.
|
|
18
|
+
*/
|
|
19
|
+
private tryMarkdownSource;
|
|
15
20
|
/**
|
|
16
21
|
* Convert a buffer with stream info to markdown.
|
|
17
22
|
*/
|
|
18
23
|
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
19
24
|
}
|
|
25
|
+
/**
|
|
26
|
+
* Try to discover a raw markdown source URL from an HTML response.
|
|
27
|
+
* Checks multiple patterns:
|
|
28
|
+
* 1. <link rel="alternate" type="text/markdown" href="..."> tag
|
|
29
|
+
* 2. VitePress markers → append .md to the URL
|
|
30
|
+
* 3. llms.txt convention → try url.md or url.html.md
|
|
31
|
+
*
|
|
32
|
+
* @internal Exported for testing.
|
|
33
|
+
*/
|
|
34
|
+
export declare function discoverMarkdownSource(html: string, url: string, ext: string): string | null;
|
package/dist/markit.js
CHANGED
|
@@ -19,6 +19,7 @@ import { XlsxConverter } from "./converters/xlsx.js";
|
|
|
19
19
|
import { XmlConverter } from "./converters/xml.js";
|
|
20
20
|
import { YamlConverter } from "./converters/yaml.js";
|
|
21
21
|
import { ZipConverter } from "./converters/zip.js";
|
|
22
|
+
const USER_AGENT = "markit/0.1.0";
|
|
22
23
|
export class Markit {
|
|
23
24
|
converters = [];
|
|
24
25
|
options;
|
|
@@ -89,25 +90,72 @@ export class Markit {
|
|
|
89
90
|
const response = await fetch(url, {
|
|
90
91
|
headers: {
|
|
91
92
|
Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
|
|
92
|
-
"User-Agent":
|
|
93
|
+
"User-Agent": USER_AGENT,
|
|
93
94
|
},
|
|
94
95
|
});
|
|
95
96
|
if (!response.ok) {
|
|
96
97
|
throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
|
|
97
98
|
}
|
|
98
99
|
const contentType = response.headers.get("content-type") || "";
|
|
99
|
-
const
|
|
100
|
-
// Derive extension from URL path
|
|
100
|
+
const mimetype = contentType.split(";")[0].trim();
|
|
101
101
|
const urlPath = new URL(url).pathname;
|
|
102
102
|
const ext = extname(urlPath).toLowerCase();
|
|
103
|
+
// Content negotiation worked — server returned markdown directly
|
|
104
|
+
if (mimetype === "text/markdown") {
|
|
105
|
+
const buffer = Buffer.from(await response.arrayBuffer());
|
|
106
|
+
return this.convert(buffer, {
|
|
107
|
+
url,
|
|
108
|
+
mimetype: "text/markdown",
|
|
109
|
+
extension: ".md",
|
|
110
|
+
filename: basename(urlPath) || undefined,
|
|
111
|
+
});
|
|
112
|
+
}
|
|
103
113
|
const buffer = Buffer.from(await response.arrayBuffer());
|
|
104
|
-
|
|
114
|
+
// For HTML responses, try to discover a raw markdown source.
|
|
115
|
+
// Patterns: <link rel="alternate">, VitePress .md files, llms.txt convention.
|
|
116
|
+
if (mimetype === "text/html") {
|
|
117
|
+
const result = await this.tryMarkdownSource(buffer, url, ext);
|
|
118
|
+
if (result)
|
|
119
|
+
return result;
|
|
120
|
+
}
|
|
121
|
+
return this.convert(buffer, {
|
|
105
122
|
url,
|
|
106
|
-
mimetype
|
|
123
|
+
mimetype,
|
|
107
124
|
extension: ext || undefined,
|
|
108
125
|
filename: basename(urlPath) || undefined,
|
|
109
|
-
};
|
|
110
|
-
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Inspect an HTML response for a discoverable markdown source URL.
|
|
130
|
+
* If found, fetch and convert the raw markdown instead.
|
|
131
|
+
*/
|
|
132
|
+
async tryMarkdownSource(htmlBuffer, url, ext) {
|
|
133
|
+
const html = htmlBuffer.toString("utf-8", 0, Math.min(htmlBuffer.length, 50_000));
|
|
134
|
+
const mdSourceUrl = discoverMarkdownSource(html, url, ext);
|
|
135
|
+
if (!mdSourceUrl)
|
|
136
|
+
return null;
|
|
137
|
+
try {
|
|
138
|
+
const response = await fetch(mdSourceUrl, {
|
|
139
|
+
headers: { "User-Agent": USER_AGENT },
|
|
140
|
+
});
|
|
141
|
+
if (!response.ok)
|
|
142
|
+
return null;
|
|
143
|
+
const ct = (response.headers.get("content-type") || "")
|
|
144
|
+
.split(";")[0]
|
|
145
|
+
.trim();
|
|
146
|
+
if (!ct.includes("markdown") && !ct.includes("text/plain"))
|
|
147
|
+
return null;
|
|
148
|
+
const mdBuffer = Buffer.from(await response.arrayBuffer());
|
|
149
|
+
return this.convert(mdBuffer, {
|
|
150
|
+
url: mdSourceUrl,
|
|
151
|
+
mimetype: "text/markdown",
|
|
152
|
+
extension: ".md",
|
|
153
|
+
filename: basename(new URL(mdSourceUrl).pathname),
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
|
+
catch {
|
|
157
|
+
return null;
|
|
158
|
+
}
|
|
111
159
|
}
|
|
112
160
|
/**
|
|
113
161
|
* Convert a buffer with stream info to markdown.
|
|
@@ -136,3 +184,38 @@ export class Markit {
|
|
|
136
184
|
throw new Error(`Unsupported format: ${streamInfo.extension || streamInfo.mimetype || "unknown"}`);
|
|
137
185
|
}
|
|
138
186
|
}
|
|
187
|
+
/**
|
|
188
|
+
* Try to discover a raw markdown source URL from an HTML response.
|
|
189
|
+
* Checks multiple patterns:
|
|
190
|
+
* 1. <link rel="alternate" type="text/markdown" href="..."> tag
|
|
191
|
+
* 2. VitePress markers → append .md to the URL
|
|
192
|
+
* 3. llms.txt convention → try url.md or url.html.md
|
|
193
|
+
*
|
|
194
|
+
* @internal Exported for testing.
|
|
195
|
+
*/
|
|
196
|
+
export function discoverMarkdownSource(html, url, ext) {
|
|
197
|
+
// 1. Look for <link rel="alternate" type="text/markdown" href="...">
|
|
198
|
+
const linkMatch = html.match(/<link[^>]+rel=["']alternate["'][^>]+type=["']text\/markdown["'][^>]+href=["']([^"']+)["']/i) ??
|
|
199
|
+
html.match(/<link[^>]+type=["']text\/markdown["'][^>]+rel=["']alternate["'][^>]+href=["']([^"']+)["']/i);
|
|
200
|
+
if (linkMatch?.[1]) {
|
|
201
|
+
try {
|
|
202
|
+
return new URL(linkMatch[1], url).href;
|
|
203
|
+
}
|
|
204
|
+
catch {
|
|
205
|
+
/* ignore malformed URLs */
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
// 2. VitePress detection — serves .md alongside HTML
|
|
209
|
+
const isVitePress = html.includes("__VP_HASH_MAP__") ||
|
|
210
|
+
html.includes("VPContent") ||
|
|
211
|
+
html.includes("vitepress");
|
|
212
|
+
// 3. llms.txt convention: try url.md for extensionless URLs
|
|
213
|
+
const hasLlmsTxt = html.includes("llms.txt");
|
|
214
|
+
if (!ext && (isVitePress || hasLlmsTxt)) {
|
|
215
|
+
return appendMdExtension(url);
|
|
216
|
+
}
|
|
217
|
+
return null;
|
|
218
|
+
}
|
|
219
|
+
function appendMdExtension(url) {
|
|
220
|
+
return url.endsWith("/") ? `${url.slice(0, -1)}.md` : `${url}.md`;
|
|
221
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.5.
|
|
3
|
+
"version": "0.5.2",
|
|
4
4
|
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|