markit-ai 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/markit.d.ts +26 -0
- package/dist/markit.js +136 -7
- package/package.json +1 -1
package/dist/markit.d.ts
CHANGED
|
@@ -12,8 +12,34 @@ export declare class Markit {
|
|
|
12
12
|
* Convert a URL to markdown.
|
|
13
13
|
*/
|
|
14
14
|
convertUrl(url: string): Promise<ConversionResult>;
|
|
15
|
+
/**
|
|
16
|
+
* For root URLs, check if the site publishes /llms.txt.
|
|
17
|
+
* If it exists, return it as markdown directly.
|
|
18
|
+
*/
|
|
19
|
+
private tryLlmsTxt;
|
|
20
|
+
/**
|
|
21
|
+
* Inspect an HTML response for a discoverable markdown source URL.
|
|
22
|
+
* If found, fetch and convert the raw markdown instead.
|
|
23
|
+
*/
|
|
24
|
+
private tryMarkdownSource;
|
|
25
|
+
/**
|
|
26
|
+
* Fetch a markdown source URL, validating the response is actually markdown.
|
|
27
|
+
*/
|
|
28
|
+
private fetchMarkdownSource;
|
|
15
29
|
/**
|
|
16
30
|
* Convert a buffer with stream info to markdown.
|
|
17
31
|
*/
|
|
18
32
|
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
19
33
|
}
|
|
34
|
+
/**
|
|
35
|
+
* Try to discover a raw markdown source URL from an HTML response.
|
|
36
|
+
* Checks for known markers in the HTML itself:
|
|
37
|
+
* 1. <link rel="alternate" type="text/markdown" href="..."> tag
|
|
38
|
+
* 2. VitePress markers → append .md to the URL
|
|
39
|
+
*
|
|
40
|
+
* The llms.txt .md probe is handled separately in tryMarkdownSource
|
|
41
|
+
* as a fallback when no markers are found.
|
|
42
|
+
*
|
|
43
|
+
* @internal Exported for testing.
|
|
44
|
+
*/
|
|
45
|
+
export declare function discoverMarkdownSource(html: string, url: string, ext: string): string | null;
|
package/dist/markit.js
CHANGED
|
@@ -19,6 +19,7 @@ import { XlsxConverter } from "./converters/xlsx.js";
|
|
|
19
19
|
import { XmlConverter } from "./converters/xml.js";
|
|
20
20
|
import { YamlConverter } from "./converters/yaml.js";
|
|
21
21
|
import { ZipConverter } from "./converters/zip.js";
|
|
22
|
+
const USER_AGENT = "markit/0.1.0";
|
|
22
23
|
export class Markit {
|
|
23
24
|
converters = [];
|
|
24
25
|
options;
|
|
@@ -86,28 +87,121 @@ export class Markit {
|
|
|
86
87
|
// Fall through to default fetch path
|
|
87
88
|
}
|
|
88
89
|
}
|
|
90
|
+
// For root URLs, check if the site has /llms.txt and return it if so
|
|
91
|
+
const parsedUrl = new URL(url);
|
|
92
|
+
if (parsedUrl.pathname === "/" || parsedUrl.pathname === "") {
|
|
93
|
+
const result = await this.tryLlmsTxt(parsedUrl.origin);
|
|
94
|
+
if (result)
|
|
95
|
+
return result;
|
|
96
|
+
}
|
|
89
97
|
const response = await fetch(url, {
|
|
90
98
|
headers: {
|
|
91
99
|
Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
|
|
92
|
-
"User-Agent":
|
|
100
|
+
"User-Agent": USER_AGENT,
|
|
93
101
|
},
|
|
94
102
|
});
|
|
95
103
|
if (!response.ok) {
|
|
96
104
|
throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
|
|
97
105
|
}
|
|
98
106
|
const contentType = response.headers.get("content-type") || "";
|
|
99
|
-
const
|
|
100
|
-
// Derive extension from URL path
|
|
107
|
+
const mimetype = contentType.split(";")[0].trim();
|
|
101
108
|
const urlPath = new URL(url).pathname;
|
|
102
109
|
const ext = extname(urlPath).toLowerCase();
|
|
110
|
+
// Content negotiation worked — server returned markdown directly
|
|
111
|
+
if (mimetype === "text/markdown") {
|
|
112
|
+
const buffer = Buffer.from(await response.arrayBuffer());
|
|
113
|
+
return this.convert(buffer, {
|
|
114
|
+
url,
|
|
115
|
+
mimetype: "text/markdown",
|
|
116
|
+
extension: ".md",
|
|
117
|
+
filename: basename(urlPath) || undefined,
|
|
118
|
+
});
|
|
119
|
+
}
|
|
103
120
|
const buffer = Buffer.from(await response.arrayBuffer());
|
|
104
|
-
|
|
121
|
+
// For HTML responses, try to discover a raw markdown source.
|
|
122
|
+
// Patterns: <link rel="alternate">, VitePress .md files, llms.txt convention.
|
|
123
|
+
if (mimetype === "text/html") {
|
|
124
|
+
const result = await this.tryMarkdownSource(buffer, url, ext);
|
|
125
|
+
if (result)
|
|
126
|
+
return result;
|
|
127
|
+
}
|
|
128
|
+
return this.convert(buffer, {
|
|
105
129
|
url,
|
|
106
|
-
mimetype
|
|
130
|
+
mimetype,
|
|
107
131
|
extension: ext || undefined,
|
|
108
132
|
filename: basename(urlPath) || undefined,
|
|
109
|
-
};
|
|
110
|
-
|
|
133
|
+
});
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* For root URLs, check if the site publishes /llms.txt.
|
|
137
|
+
* If it exists, return it as markdown directly.
|
|
138
|
+
*/
|
|
139
|
+
async tryLlmsTxt(origin) {
|
|
140
|
+
const llmsTxtUrl = `${origin}/llms.txt`;
|
|
141
|
+
try {
|
|
142
|
+
const response = await fetch(llmsTxtUrl, {
|
|
143
|
+
method: "HEAD",
|
|
144
|
+
headers: { "User-Agent": USER_AGENT },
|
|
145
|
+
});
|
|
146
|
+
if (!response.ok)
|
|
147
|
+
return null;
|
|
148
|
+
const ct = (response.headers.get("content-type") || "")
|
|
149
|
+
.split(";")[0]
|
|
150
|
+
.trim();
|
|
151
|
+
if (!ct.includes("markdown") &&
|
|
152
|
+
!ct.includes("text/plain") &&
|
|
153
|
+
!ct.includes("text/html"))
|
|
154
|
+
return null;
|
|
155
|
+
// HEAD succeeded — now GET the content
|
|
156
|
+
const getResponse = await fetch(llmsTxtUrl, {
|
|
157
|
+
headers: { "User-Agent": USER_AGENT },
|
|
158
|
+
});
|
|
159
|
+
if (!getResponse.ok)
|
|
160
|
+
return null;
|
|
161
|
+
const buffer = Buffer.from(await getResponse.arrayBuffer());
|
|
162
|
+
return { markdown: buffer.toString("utf-8") };
|
|
163
|
+
}
|
|
164
|
+
catch {
|
|
165
|
+
return null;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
/**
|
|
169
|
+
* Inspect an HTML response for a discoverable markdown source URL.
|
|
170
|
+
* If found, fetch and convert the raw markdown instead.
|
|
171
|
+
*/
|
|
172
|
+
async tryMarkdownSource(htmlBuffer, url, ext) {
|
|
173
|
+
const html = htmlBuffer.toString("utf-8", 0, Math.min(htmlBuffer.length, 50_000));
|
|
174
|
+
const mdSourceUrl = discoverMarkdownSource(html, url, ext);
|
|
175
|
+
if (!mdSourceUrl)
|
|
176
|
+
return null;
|
|
177
|
+
return this.fetchMarkdownSource(mdSourceUrl);
|
|
178
|
+
}
|
|
179
|
+
/**
|
|
180
|
+
* Fetch a markdown source URL, validating the response is actually markdown.
|
|
181
|
+
*/
|
|
182
|
+
async fetchMarkdownSource(mdUrl) {
|
|
183
|
+
try {
|
|
184
|
+
const response = await fetch(mdUrl, {
|
|
185
|
+
headers: { "User-Agent": USER_AGENT },
|
|
186
|
+
});
|
|
187
|
+
if (!response.ok)
|
|
188
|
+
return null;
|
|
189
|
+
const ct = (response.headers.get("content-type") || "")
|
|
190
|
+
.split(";")[0]
|
|
191
|
+
.trim();
|
|
192
|
+
if (!ct.includes("markdown") && !ct.includes("text/plain"))
|
|
193
|
+
return null;
|
|
194
|
+
const mdBuffer = Buffer.from(await response.arrayBuffer());
|
|
195
|
+
return this.convert(mdBuffer, {
|
|
196
|
+
url: mdUrl,
|
|
197
|
+
mimetype: "text/markdown",
|
|
198
|
+
extension: ".md",
|
|
199
|
+
filename: basename(new URL(mdUrl).pathname),
|
|
200
|
+
});
|
|
201
|
+
}
|
|
202
|
+
catch {
|
|
203
|
+
return null;
|
|
204
|
+
}
|
|
111
205
|
}
|
|
112
206
|
/**
|
|
113
207
|
* Convert a buffer with stream info to markdown.
|
|
@@ -136,3 +230,38 @@ export class Markit {
|
|
|
136
230
|
throw new Error(`Unsupported format: ${streamInfo.extension || streamInfo.mimetype || "unknown"}`);
|
|
137
231
|
}
|
|
138
232
|
}
|
|
233
|
+
/**
|
|
234
|
+
* Try to discover a raw markdown source URL from an HTML response.
|
|
235
|
+
* Checks for known markers in the HTML itself:
|
|
236
|
+
* 1. <link rel="alternate" type="text/markdown" href="..."> tag
|
|
237
|
+
* 2. VitePress markers → append .md to the URL
|
|
238
|
+
*
|
|
239
|
+
* The llms.txt .md probe is handled separately in tryMarkdownSource
|
|
240
|
+
* as a fallback when no markers are found.
|
|
241
|
+
*
|
|
242
|
+
* @internal Exported for testing.
|
|
243
|
+
*/
|
|
244
|
+
export function discoverMarkdownSource(html, url, ext) {
|
|
245
|
+
// 1. Look for <link rel="alternate" type="text/markdown" href="...">
|
|
246
|
+
const linkMatch = html.match(/<link[^>]+rel=["']alternate["'][^>]+type=["']text\/markdown["'][^>]+href=["']([^"']+)["']/i) ??
|
|
247
|
+
html.match(/<link[^>]+type=["']text\/markdown["'][^>]+rel=["']alternate["'][^>]+href=["']([^"']+)["']/i);
|
|
248
|
+
if (linkMatch?.[1]) {
|
|
249
|
+
try {
|
|
250
|
+
return new URL(linkMatch[1], url).href;
|
|
251
|
+
}
|
|
252
|
+
catch {
|
|
253
|
+
/* ignore malformed URLs */
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
// 2. VitePress detection — serves .md alongside HTML
|
|
257
|
+
if (!ext &&
|
|
258
|
+
(html.includes("__VP_HASH_MAP__") ||
|
|
259
|
+
html.includes("VPContent") ||
|
|
260
|
+
html.includes("vitepress"))) {
|
|
261
|
+
return appendMdExtension(url);
|
|
262
|
+
}
|
|
263
|
+
return null;
|
|
264
|
+
}
|
|
265
|
+
function appendMdExtension(url) {
|
|
266
|
+
return url.endsWith("/") ? `${url.slice(0, -1)}.md` : `${url}.md`;
|
|
267
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.5.1",
|
|
3
|
+
"version": "0.5.3",
|
|
4
4
|
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|