npm - markit-ai - Versions diffs - 0.5.2 → 0.5.3 - Mend

markit-ai 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/markit.d.ts CHANGED Viewed

@@ -12,11 +12,20 @@ export declare class Markit {
      * Convert a URL to markdown.
      */
     convertUrl(url: string): Promise<ConversionResult>;
+    /**
+     * For root URLs, check if the site publishes /llms.txt.
+     * If it exists, return it as markdown directly.
+     */
+    private tryLlmsTxt;
     /**
      * Inspect an HTML response for a discoverable markdown source URL.
      * If found, fetch and convert the raw markdown instead.
      */
     private tryMarkdownSource;
+    /**
+     * Fetch a markdown source URL, validating the response is actually markdown.
+     */
+    private fetchMarkdownSource;
     /**
      * Convert a buffer with stream info to markdown.
      */
@@ -24,10 +33,12 @@ export declare class Markit {
 }
 /**
  * Try to discover a raw markdown source URL from an HTML response.
- * Checks multiple patterns:
+ * Checks for known markers in the HTML itself:
  *   1. <link rel="alternate" type="text/markdown" href="..."> tag
  *   2. VitePress markers → append .md to the URL
- *   3. llms.txt convention → try url.md or url.html.md
+ *
+ * llms.txt is not checked here: for root URLs, convertUrl probes
+ * /llms.txt via tryLlmsTxt before the page itself is fetched.
  *
  * @internal Exported for testing.
  */

package/dist/markit.js CHANGED Viewed

@@ -87,6 +87,13 @@ export class Markit {
                 // Fall through to default fetch path
             }
         }
+        // For root URLs, check if the site has /llms.txt and return it if so
+        const parsedUrl = new URL(url);
+        if (parsedUrl.pathname === "/" || parsedUrl.pathname === "") {
+            const result = await this.tryLlmsTxt(parsedUrl.origin);
+            if (result)
+                return result;
+        }
         const response = await fetch(url, {
             headers: {
                 Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
@@ -125,6 +132,39 @@ export class Markit {
             filename: basename(urlPath) || undefined,
         });
     }
+    /**
+     * For root URLs, check if the site publishes /llms.txt.
+     * If it exists, return it as markdown directly.
+     * A single GET is issued; there is no HEAD pre-flight, so servers
+     * that reject HEAD still work. Resolves to null on any failure, a
+     * non-text content type, an empty body, or a body that is an HTML
+     * document (sites that answer every path with an HTML page).
+     */
+    async tryLlmsTxt(origin) {
+        try {
+            const response = await fetch(`${origin}/llms.txt`, {
+                headers: { "User-Agent": USER_AGENT },
+            });
+            if (!response.ok)
+                return null;
+            const ct = (response.headers.get("content-type") || "")
+                .split(";")[0]
+                .trim()
+                .toLowerCase();
+            if (!ct.includes("markdown") &&
+                !ct.includes("text/plain") &&
+                !ct.includes("text/html"))
+                return null;
+            const markdown = await response.text();
+            // A body that opens like an HTML document is not an llms.txt.
+            if (!markdown.trim() || /^\s*<(!doctype\s+html|html)\b/i.test(markdown))
+                return null;
+            return { markdown };
+        }
+        catch {
+            return null;
+        }
+    }
     /**
      * Inspect an HTML response for a discoverable markdown source URL.
      * If found, fetch and convert the raw markdown instead.
@@ -134,8 +174,14 @@ export class Markit {
         const mdSourceUrl = discoverMarkdownSource(html, url, ext);
         if (!mdSourceUrl)
             return null;
+        return this.fetchMarkdownSource(mdSourceUrl);
+    }
+    /**
+     * Fetch a markdown source URL, validating the response is actually markdown.
+     */
+    async fetchMarkdownSource(mdUrl) {
         try {
-            const response = await fetch(mdSourceUrl, {
+            const response = await fetch(mdUrl, {
                 headers: { "User-Agent": USER_AGENT },
             });
             if (!response.ok)
@@ -147,10 +193,10 @@ export class Markit {
                 return null;
             const mdBuffer = Buffer.from(await response.arrayBuffer());
             return this.convert(mdBuffer, {
-                url: mdSourceUrl,
+                url: mdUrl,
                 mimetype: "text/markdown",
                 extension: ".md",
-                filename: basename(new URL(mdSourceUrl).pathname),
+                filename: basename(new URL(mdUrl).pathname),
             });
         }
         catch {
@@ -186,10 +232,12 @@ export class Markit {
 }
 /**
  * Try to discover a raw markdown source URL from an HTML response.
- * Checks multiple patterns:
+ * Checks for known markers in the HTML itself:
  *   1. <link rel="alternate" type="text/markdown" href="..."> tag
  *   2. VitePress markers → append .md to the URL
- *   3. llms.txt convention → try url.md or url.html.md
+ *
+ * llms.txt is not checked here: for root URLs, convertUrl probes
+ * /llms.txt via tryLlmsTxt before the page itself is fetched.
  *
  * @internal Exported for testing.
  */
@@ -206,12 +254,10 @@ export function discoverMarkdownSource(html, url, ext) {
         }
     }
     // 2. VitePress detection — serves .md alongside HTML
-    const isVitePress = html.includes("__VP_HASH_MAP__") ||
-        html.includes("VPContent") ||
-        html.includes("vitepress");
-    // 3. llms.txt convention: try url.md for extensionless URLs
-    const hasLlmsTxt = html.includes("llms.txt");
-    if (!ext && (isVitePress || hasLlmsTxt)) {
+    if (!ext &&
+        (html.includes("__VP_HASH_MAP__") ||
+            html.includes("VPContent") ||
+            html.includes("vitepress"))) {
         return appendMdExtension(url);
     }
     return null;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "markit-ai",
-  "version": "0.5.2",
+  "version": "0.5.3",
   "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
   "type": "module",
   "main": "dist/index.js",