markit-ai 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/markit.d.ts +13 -2
- package/dist/markit.js +57 -11
- package/package.json +1 -1
package/dist/markit.d.ts
CHANGED
|
@@ -12,11 +12,20 @@ export declare class Markit {
|
|
|
12
12
|
* Convert a URL to markdown.
|
|
13
13
|
*/
|
|
14
14
|
convertUrl(url: string): Promise<ConversionResult>;
|
|
15
|
+
/**
|
|
16
|
+
* For root URLs, check if the site publishes /llms.txt.
|
|
17
|
+
* If it exists, return it as markdown directly.
|
|
18
|
+
*/
|
|
19
|
+
private tryLlmsTxt;
|
|
15
20
|
/**
|
|
16
21
|
* Inspect an HTML response for a discoverable markdown source URL.
|
|
17
22
|
* If found, fetch and convert the raw markdown instead.
|
|
18
23
|
*/
|
|
19
24
|
private tryMarkdownSource;
|
|
25
|
+
/**
|
|
26
|
+
* Fetch a markdown source URL, validating the response is actually markdown.
|
|
27
|
+
*/
|
|
28
|
+
private fetchMarkdownSource;
|
|
20
29
|
/**
|
|
21
30
|
* Convert a buffer with stream info to markdown.
|
|
22
31
|
*/
|
|
@@ -24,10 +33,12 @@ export declare class Markit {
|
|
|
24
33
|
}
|
|
25
34
|
/**
|
|
26
35
|
* Try to discover a raw markdown source URL from an HTML response.
|
|
27
|
-
* Checks
|
|
36
|
+
* Checks for known markers in the HTML itself:
|
|
28
37
|
* 1. <link rel="alternate" type="text/markdown" href="..."> tag
|
|
29
38
|
* 2. VitePress markers → append .md to the URL
|
|
30
|
-
*
|
|
39
|
+
*
|
|
40
|
+
* The llms.txt .md probe is handled separately in tryMarkdownSource
|
|
41
|
+
* as a fallback when no markers are found.
|
|
31
42
|
*
|
|
32
43
|
* @internal Exported for testing.
|
|
33
44
|
*/
|
package/dist/markit.js
CHANGED
|
@@ -87,6 +87,13 @@ export class Markit {
|
|
|
87
87
|
// Fall through to default fetch path
|
|
88
88
|
}
|
|
89
89
|
}
|
|
90
|
+
// For root URLs, check if the site has /llms.txt and return it if so
|
|
91
|
+
const parsedUrl = new URL(url);
|
|
92
|
+
if (parsedUrl.pathname === "/" || parsedUrl.pathname === "") {
|
|
93
|
+
const result = await this.tryLlmsTxt(parsedUrl.origin);
|
|
94
|
+
if (result)
|
|
95
|
+
return result;
|
|
96
|
+
}
|
|
90
97
|
const response = await fetch(url, {
|
|
91
98
|
headers: {
|
|
92
99
|
Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
|
|
@@ -125,6 +132,39 @@ export class Markit {
|
|
|
125
132
|
filename: basename(urlPath) || undefined,
|
|
126
133
|
});
|
|
127
134
|
}
|
|
135
|
+
/**
|
|
136
|
+
* For root URLs, check if the site publishes /llms.txt.
|
|
137
|
+
* If it exists, return it as markdown directly.
|
|
138
|
+
*/
|
|
139
|
+
async tryLlmsTxt(origin) {
|
|
140
|
+
const llmsTxtUrl = `${origin}/llms.txt`;
|
|
141
|
+
try {
|
|
142
|
+
const response = await fetch(llmsTxtUrl, {
|
|
143
|
+
method: "HEAD",
|
|
144
|
+
headers: { "User-Agent": USER_AGENT },
|
|
145
|
+
});
|
|
146
|
+
if (!response.ok)
|
|
147
|
+
return null;
|
|
148
|
+
const ct = (response.headers.get("content-type") || "")
|
|
149
|
+
.split(";")[0]
|
|
150
|
+
.trim();
|
|
151
|
+
if (!ct.includes("markdown") &&
|
|
152
|
+
!ct.includes("text/plain") &&
|
|
153
|
+
!ct.includes("text/html"))
|
|
154
|
+
return null;
|
|
155
|
+
// HEAD succeeded — now GET the content
|
|
156
|
+
const getResponse = await fetch(llmsTxtUrl, {
|
|
157
|
+
headers: { "User-Agent": USER_AGENT },
|
|
158
|
+
});
|
|
159
|
+
if (!getResponse.ok)
|
|
160
|
+
return null;
|
|
161
|
+
const buffer = Buffer.from(await getResponse.arrayBuffer());
|
|
162
|
+
return { markdown: buffer.toString("utf-8") };
|
|
163
|
+
}
|
|
164
|
+
catch {
|
|
165
|
+
return null;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
128
168
|
/**
|
|
129
169
|
* Inspect an HTML response for a discoverable markdown source URL.
|
|
130
170
|
* If found, fetch and convert the raw markdown instead.
|
|
@@ -134,8 +174,14 @@ export class Markit {
|
|
|
134
174
|
const mdSourceUrl = discoverMarkdownSource(html, url, ext);
|
|
135
175
|
if (!mdSourceUrl)
|
|
136
176
|
return null;
|
|
177
|
+
return this.fetchMarkdownSource(mdSourceUrl);
|
|
178
|
+
}
|
|
179
|
+
/**
|
|
180
|
+
* Fetch a markdown source URL, validating the response is actually markdown.
|
|
181
|
+
*/
|
|
182
|
+
async fetchMarkdownSource(mdUrl) {
|
|
137
183
|
try {
|
|
138
|
-
const response = await fetch(mdSourceUrl, {
|
|
184
|
+
const response = await fetch(mdUrl, {
|
|
139
185
|
headers: { "User-Agent": USER_AGENT },
|
|
140
186
|
});
|
|
141
187
|
if (!response.ok)
|
|
@@ -147,10 +193,10 @@ export class Markit {
|
|
|
147
193
|
return null;
|
|
148
194
|
const mdBuffer = Buffer.from(await response.arrayBuffer());
|
|
149
195
|
return this.convert(mdBuffer, {
|
|
150
|
-
url: mdSourceUrl,
|
|
196
|
+
url: mdUrl,
|
|
151
197
|
mimetype: "text/markdown",
|
|
152
198
|
extension: ".md",
|
|
153
|
-
filename: basename(new URL(mdSourceUrl).pathname),
|
|
199
|
+
filename: basename(new URL(mdUrl).pathname),
|
|
154
200
|
});
|
|
155
201
|
}
|
|
156
202
|
catch {
|
|
@@ -186,10 +232,12 @@ export class Markit {
|
|
|
186
232
|
}
|
|
187
233
|
/**
|
|
188
234
|
* Try to discover a raw markdown source URL from an HTML response.
|
|
189
|
-
* Checks
|
|
235
|
+
* Checks for known markers in the HTML itself:
|
|
190
236
|
* 1. <link rel="alternate" type="text/markdown" href="..."> tag
|
|
191
237
|
* 2. VitePress markers → append .md to the URL
|
|
192
|
-
*
|
|
238
|
+
*
|
|
239
|
+
* The llms.txt .md probe is handled separately in tryMarkdownSource
|
|
240
|
+
* as a fallback when no markers are found.
|
|
193
241
|
*
|
|
194
242
|
* @internal Exported for testing.
|
|
195
243
|
*/
|
|
@@ -206,12 +254,10 @@ export function discoverMarkdownSource(html, url, ext) {
|
|
|
206
254
|
}
|
|
207
255
|
}
|
|
208
256
|
// 2. VitePress detection — serves .md alongside HTML
|
|
209
|
-
|
|
210
|
-
html.includes("
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
const hasLlmsTxt = html.includes("llms.txt");
|
|
214
|
-
if (!ext && (isVitePress || hasLlmsTxt)) {
|
|
257
|
+
if (!ext &&
|
|
258
|
+
(html.includes("__VP_HASH_MAP__") ||
|
|
259
|
+
html.includes("VPContent") ||
|
|
260
|
+
html.includes("vitepress"))) {
|
|
215
261
|
return appendMdExtension(url);
|
|
216
262
|
}
|
|
217
263
|
return null;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.5.2",
|
|
3
|
+
"version": "0.5.3",
|
|
4
4
|
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|