markit-ai 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/markit.d.ts CHANGED
@@ -12,8 +12,34 @@ export declare class Markit {
12
12
  * Convert a URL to markdown.
13
13
  */
14
14
  convertUrl(url: string): Promise<ConversionResult>;
15
+ /**
16
+ * For root URLs, check if the site publishes /llms.txt.
17
+ * If it exists, return it as markdown directly.
18
+ */
19
+ private tryLlmsTxt;
20
+ /**
21
+ * Inspect an HTML response for a discoverable markdown source URL.
22
+ * If found, fetch and convert the raw markdown instead.
23
+ */
24
+ private tryMarkdownSource;
25
+ /**
26
+ * Fetch a markdown source URL, validating the response is actually markdown.
27
+ */
28
+ private fetchMarkdownSource;
15
29
  /**
16
30
  * Convert a buffer with stream info to markdown.
17
31
  */
18
32
  convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
19
33
  }
34
+ /**
35
+ * Try to discover a raw markdown source URL from an HTML response.
36
+ * Checks for known markers in the HTML itself:
37
+ * 1. <link rel="alternate" type="text/markdown" href="..."> tag
38
+ * 2. VitePress markers (only when ext is empty) → append .md to the URL
39
+ *
40
+ * Returns the discovered URL as a string, or null when no marker is
41
+ * found. Only the arguments are inspected; fetching is up to the caller.
42
+ *
43
+ * @internal Exported for testing.
44
+ */
45
+ export declare function discoverMarkdownSource(html: string, url: string, ext: string): string | null;
package/dist/markit.js CHANGED
@@ -19,6 +19,7 @@ import { XlsxConverter } from "./converters/xlsx.js";
19
19
  import { XmlConverter } from "./converters/xml.js";
20
20
  import { YamlConverter } from "./converters/yaml.js";
21
21
  import { ZipConverter } from "./converters/zip.js";
22
+ const USER_AGENT = "markit/0.1.0";
22
23
  export class Markit {
23
24
  converters = [];
24
25
  options;
@@ -86,28 +87,121 @@ export class Markit {
86
87
  // Fall through to default fetch path
87
88
  }
88
89
  }
90
+ // For root URLs, check if the site has /llms.txt and return it if so
91
+ const parsedUrl = new URL(url);
92
+ if (parsedUrl.pathname === "/" || parsedUrl.pathname === "") {
93
+ const result = await this.tryLlmsTxt(parsedUrl.origin);
94
+ if (result)
95
+ return result;
96
+ }
89
97
  const response = await fetch(url, {
90
98
  headers: {
91
99
  Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
92
- "User-Agent": "mill/0.1.0",
100
+ "User-Agent": USER_AGENT,
93
101
  },
94
102
  });
95
103
  if (!response.ok) {
96
104
  throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
97
105
  }
98
106
  const contentType = response.headers.get("content-type") || "";
99
- const [mimetype] = contentType.split(";");
100
- // Derive extension from URL path
107
+ const mimetype = contentType.split(";")[0].trim();
101
108
  const urlPath = new URL(url).pathname;
102
109
  const ext = extname(urlPath).toLowerCase();
110
+ // Content negotiation worked — server returned markdown directly
111
+ if (mimetype === "text/markdown") {
112
+ const buffer = Buffer.from(await response.arrayBuffer());
113
+ return this.convert(buffer, {
114
+ url,
115
+ mimetype: "text/markdown",
116
+ extension: ".md",
117
+ filename: basename(urlPath) || undefined,
118
+ });
119
+ }
103
120
  const buffer = Buffer.from(await response.arrayBuffer());
104
- const fetchedInfo = {
121
+ // For HTML responses, try to discover a raw markdown source.
122
+ // Patterns: <link rel="alternate">, VitePress .md files, llms.txt convention.
123
+ if (mimetype === "text/html") {
124
+ const result = await this.tryMarkdownSource(buffer, url, ext);
125
+ if (result)
126
+ return result;
127
+ }
128
+ return this.convert(buffer, {
105
129
  url,
106
- mimetype: mimetype.trim(),
130
+ mimetype,
107
131
  extension: ext || undefined,
108
132
  filename: basename(urlPath) || undefined,
109
- };
110
- return this.convert(buffer, fetchedInfo);
133
+ });
134
+ }
135
+ /**
136
+ * For root URLs, check if the site publishes /llms.txt.
137
+ * If it exists, return it as markdown directly.
138
+ */
139
+ async tryLlmsTxt(origin) {
140
+ const llmsTxtUrl = `${origin}/llms.txt`;
141
+ try {
142
+ const response = await fetch(llmsTxtUrl, {
143
+ method: "HEAD",
144
+ headers: { "User-Agent": USER_AGENT },
145
+ });
146
+ if (!response.ok)
147
+ return null;
148
+ const ct = (response.headers.get("content-type") || "")
149
+ .split(";")[0]
150
+ .trim();
151
+ if (!ct.includes("markdown") &&
152
+ !ct.includes("text/plain") &&
153
+ !ct.includes("text/html"))
154
+ return null;
155
+ // HEAD succeeded — now GET the content
156
+ const getResponse = await fetch(llmsTxtUrl, {
157
+ headers: { "User-Agent": USER_AGENT },
158
+ });
159
+ if (!getResponse.ok)
160
+ return null;
161
+ const buffer = Buffer.from(await getResponse.arrayBuffer());
162
+ return { markdown: buffer.toString("utf-8") };
163
+ }
164
+ catch {
165
+ return null;
166
+ }
167
+ }
168
+ /**
169
+ * Inspect an HTML response for a discoverable markdown source URL.
170
+ * If found, fetch and convert the raw markdown instead.
171
+ */
172
+ async tryMarkdownSource(htmlBuffer, url, ext) {
173
+ const html = htmlBuffer.toString("utf-8", 0, Math.min(htmlBuffer.length, 50_000));
174
+ const mdSourceUrl = discoverMarkdownSource(html, url, ext);
175
+ if (!mdSourceUrl)
176
+ return null;
177
+ return this.fetchMarkdownSource(mdSourceUrl);
178
+ }
179
+ /**
180
+ * Fetch a markdown source URL, validating the response is actually markdown.
181
+ */
182
+ async fetchMarkdownSource(mdUrl) {
183
+ try {
184
+ const response = await fetch(mdUrl, {
185
+ headers: { "User-Agent": USER_AGENT },
186
+ });
187
+ if (!response.ok)
188
+ return null;
189
+ const ct = (response.headers.get("content-type") || "")
190
+ .split(";")[0]
191
+ .trim();
192
+ if (!ct.includes("markdown") && !ct.includes("text/plain"))
193
+ return null;
194
+ const mdBuffer = Buffer.from(await response.arrayBuffer());
195
+ return this.convert(mdBuffer, {
196
+ url: mdUrl,
197
+ mimetype: "text/markdown",
198
+ extension: ".md",
199
+ filename: basename(new URL(mdUrl).pathname),
200
+ });
201
+ }
202
+ catch {
203
+ return null;
204
+ }
111
205
  }
112
206
  /**
113
207
  * Convert a buffer with stream info to markdown.
@@ -136,3 +230,38 @@ export class Markit {
136
230
  throw new Error(`Unsupported format: ${streamInfo.extension || streamInfo.mimetype || "unknown"}`);
137
231
  }
138
232
  }
233
/** Matches every `<link ...>` tag, including ones that span several lines. */
const MARKDOWN_LINK_TAG_PATTERN = /<link\b[^>]*>/gi;
/** Per-attribute matchers for a quoted value inside a single `<link>` tag. */
const MARKDOWN_LINK_ATTRIBUTE_PATTERNS = {
    rel: /\srel\s*=\s*(?:"([^"]*)"|'([^']*)')/i,
    type: /\stype\s*=\s*(?:"([^"]*)"|'([^']*)')/i,
    href: /\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/i,
};
/**
 * Try to discover a raw markdown source URL from an HTML response.
 * Checks for known markers in the HTML itself:
 * 1. <link rel="alternate" type="text/markdown" href="..."> tag, with the
 *    attributes in any order; href is resolved against `url`
 * 2. VitePress markers (only when `ext` is empty) → append .md to the URL path
 *
 * This function only inspects its arguments; it performs no network access.
 *
 * @param html HTML text to scan.
 * @param url URL the HTML was fetched from; base for relative hrefs.
 * @param ext File extension of the URL path ("" when there is none).
 * @returns The absolute markdown source URL, or null when no marker is found.
 *
 * @internal Exported for testing.
 */
export function discoverMarkdownSource(html, url, ext) {
    // 1. Look for <link rel="alternate" type="text/markdown" href="...">
    for (const [tag] of html.matchAll(MARKDOWN_LINK_TAG_PATTERN)) {
        const rel = readLinkAttribute(tag, "rel");
        const type = readLinkAttribute(tag, "type");
        const href = readLinkAttribute(tag, "href");
        if (!rel || !type || !href)
            continue;
        // rel is a space-separated token list; type is case-insensitive.
        const isAlternate = rel.toLowerCase().split(/\s+/).includes("alternate");
        if (!isAlternate || type.trim().toLowerCase() !== "text/markdown")
            continue;
        try {
            return new URL(href, url).href;
        }
        catch {
            /* ignore malformed URLs and keep looking */
        }
    }
    // 2. VitePress detection — serves .md alongside HTML
    if (!ext &&
        (html.includes("__VP_HASH_MAP__") ||
            html.includes("VPContent") ||
            html.includes("vitepress"))) {
        return appendMdExtension(url);
    }
    return null;
}
/** Return the quoted value of attribute `name` inside `tag`, or null. */
function readLinkAttribute(tag, name) {
    const match = MARKDOWN_LINK_ATTRIBUTE_PATTERNS[name].exec(tag);
    if (!match)
        return null;
    return match[1] ?? match[2] ?? null;
}
/**
 * Append `.md` to the *path* of `url`, leaving origin, query string and
 * fragment untouched. A single trailing slash is dropped first
 * (`/guide/` → `/guide.md`); the site root maps to `/index.md`, because
 * appending to the raw string would turn `https://host/` into
 * `https://host.md`, i.e. a different hostname.
 */
function appendMdExtension(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        // Not an absolute URL: fall back to plain string handling.
        return url.endsWith("/") ? `${url.slice(0, -1)}.md` : `${url}.md`;
    }
    const path = parsed.pathname.replace(/\/$/, "");
    parsed.pathname = path === "" ? "/index.md" : `${path}.md`;
    return parsed.href;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "markit-ai",
3
- "version": "0.5.1",
3
+ "version": "0.5.3",
4
4
  "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",