markit-ai 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/markit.d.ts CHANGED
@@ -12,11 +12,20 @@ export declare class Markit {
12
12
  * Convert a URL to markdown.
13
13
  */
14
14
  convertUrl(url: string): Promise<ConversionResult>;
15
+ /**
16
+ * For root URLs, check if the site publishes /llms.txt.
17
+ * If it exists, return it as markdown directly.
18
+ */
19
+ private tryLlmsTxt;
15
20
  /**
16
21
  * Inspect an HTML response for a discoverable markdown source URL.
17
22
  * If found, fetch and convert the raw markdown instead.
18
23
  */
19
24
  private tryMarkdownSource;
25
+ /**
26
+ * Fetch a markdown source URL, validating the response is actually markdown.
27
+ */
28
+ private fetchMarkdownSource;
20
29
  /**
21
30
  * Convert a buffer with stream info to markdown.
22
31
  */
@@ -24,10 +33,12 @@ export declare class Markit {
24
33
  }
25
34
  /**
26
35
  * Try to discover a raw markdown source URL from an HTML response.
27
- * Checks multiple patterns:
36
+ * Checks for known markers in the HTML itself:
28
37
  * 1. <link rel="alternate" type="text/markdown" href="..."> tag
29
38
  * 2. VitePress markers → append .md to the URL
30
- * 3. llms.txt convention → try url.md or url.html.md
39
+ *
40
+ * The llms.txt .md probe is handled separately in tryMarkdownSource
41
+ * as a fallback when no markers are found.
31
42
  *
32
43
  * @internal Exported for testing.
33
44
  */
package/dist/markit.js CHANGED
@@ -87,6 +87,13 @@ export class Markit {
87
87
  // Fall through to default fetch path
88
88
  }
89
89
  }
90
+ // For root URLs, check if the site has /llms.txt and return it if so
91
+ const parsedUrl = new URL(url);
92
+ if (parsedUrl.pathname === "/" || parsedUrl.pathname === "") {
93
+ const result = await this.tryLlmsTxt(parsedUrl.origin);
94
+ if (result)
95
+ return result;
96
+ }
90
97
  const response = await fetch(url, {
91
98
  headers: {
92
99
  Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
@@ -125,6 +132,39 @@ export class Markit {
125
132
  filename: basename(urlPath) || undefined,
126
133
  });
127
134
  }
135
+ /**
136
+ * For root URLs, check if the site publishes /llms.txt.
137
+ * If it exists, return it as markdown directly.
138
+ */
139
+ async tryLlmsTxt(origin) {
140
+ const llmsTxtUrl = `${origin}/llms.txt`;
141
+ try {
142
+ const response = await fetch(llmsTxtUrl, {
143
+ method: "HEAD",
144
+ headers: { "User-Agent": USER_AGENT },
145
+ });
146
+ if (!response.ok)
147
+ return null;
148
+ const ct = (response.headers.get("content-type") || "")
149
+ .split(";")[0]
150
+ .trim();
151
+ if (!ct.includes("markdown") &&
152
+ !ct.includes("text/plain") &&
153
+ !ct.includes("text/html"))
154
+ return null;
155
+ // HEAD succeeded — now GET the content
156
+ const getResponse = await fetch(llmsTxtUrl, {
157
+ headers: { "User-Agent": USER_AGENT },
158
+ });
159
+ if (!getResponse.ok)
160
+ return null;
161
+ const buffer = Buffer.from(await getResponse.arrayBuffer());
162
+ return { markdown: buffer.toString("utf-8") };
163
+ }
164
+ catch {
165
+ return null;
166
+ }
167
+ }
128
168
  /**
129
169
  * Inspect an HTML response for a discoverable markdown source URL.
130
170
  * If found, fetch and convert the raw markdown instead.
@@ -134,8 +174,14 @@ export class Markit {
134
174
  const mdSourceUrl = discoverMarkdownSource(html, url, ext);
135
175
  if (!mdSourceUrl)
136
176
  return null;
177
+ return this.fetchMarkdownSource(mdSourceUrl);
178
+ }
179
+ /**
180
+ * Fetch a markdown source URL, validating the response is actually markdown.
181
+ */
182
+ async fetchMarkdownSource(mdUrl) {
137
183
  try {
138
- const response = await fetch(mdSourceUrl, {
184
+ const response = await fetch(mdUrl, {
139
185
  headers: { "User-Agent": USER_AGENT },
140
186
  });
141
187
  if (!response.ok)
@@ -147,10 +193,10 @@ export class Markit {
147
193
  return null;
148
194
  const mdBuffer = Buffer.from(await response.arrayBuffer());
149
195
  return this.convert(mdBuffer, {
150
- url: mdSourceUrl,
196
+ url: mdUrl,
151
197
  mimetype: "text/markdown",
152
198
  extension: ".md",
153
- filename: basename(new URL(mdSourceUrl).pathname),
199
+ filename: basename(new URL(mdUrl).pathname),
154
200
  });
155
201
  }
156
202
  catch {
@@ -186,10 +232,12 @@ export class Markit {
186
232
  }
187
233
  /**
188
234
  * Try to discover a raw markdown source URL from an HTML response.
189
- * Checks multiple patterns:
235
+ * Checks for known markers in the HTML itself:
190
236
  * 1. <link rel="alternate" type="text/markdown" href="..."> tag
191
237
  * 2. VitePress markers → append .md to the URL
192
- * 3. llms.txt convention → try url.md or url.html.md
238
+ *
239
+ * The llms.txt .md probe is handled separately in tryMarkdownSource
240
+ * as a fallback when no markers are found.
193
241
  *
194
242
  * @internal Exported for testing.
195
243
  */
@@ -206,12 +254,10 @@ export function discoverMarkdownSource(html, url, ext) {
206
254
  }
207
255
  }
208
256
  // 2. VitePress detection — serves .md alongside HTML
209
- const isVitePress = html.includes("__VP_HASH_MAP__") ||
210
- html.includes("VPContent") ||
211
- html.includes("vitepress");
212
- // 3. llms.txt convention: try url.md for extensionless URLs
213
- const hasLlmsTxt = html.includes("llms.txt");
214
- if (!ext && (isVitePress || hasLlmsTxt)) {
257
+ if (!ext &&
258
+ (html.includes("__VP_HASH_MAP__") ||
259
+ html.includes("VPContent") ||
260
+ html.includes("vitepress"))) {
215
261
  return appendMdExtension(url);
216
262
  }
217
263
  return null;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "markit-ai",
3
- "version": "0.5.2",
3
+ "version": "0.5.3",
4
4
  "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",