npm - markit-ai - Version diff - 0.5.1 → 0.5.3 - Mend

markit-ai 0.5.1 → 0.5.3

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/markit.d.ts CHANGED

@@ -12,8 +12,34 @@ export declare class Markit {
      * Convert a URL to markdown.
      */
     convertUrl(url: string): Promise<ConversionResult>;
+    /**
+     * For root URLs, check if the site publishes /llms.txt.
+     * If it exists, return it as markdown directly.
+     */
+    private tryLlmsTxt;
+    /**
+     * Inspect an HTML response for a discoverable markdown source URL.
+     * If found, fetch and convert the raw markdown instead.
+     */
+    private tryMarkdownSource;
+    /**
+     * Fetch a markdown source URL, validating the response is actually markdown.
+     */
+    private fetchMarkdownSource;
     /**
      * Convert a buffer with stream info to markdown.
      */
     convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
 }
+/**
+ * Try to discover a raw markdown source URL from an HTML response.
+ * Checks for known markers in the HTML itself:
+ *   1. <link rel="alternate" type="text/markdown" href="..."> tag
+ *   2. VitePress markers → append .md to the URL
+ *
+ * The llms.txt .md probe is handled separately in tryMarkdownSource
+ * as a fallback when no markers are found.
+ *
+ * @internal Exported for testing.
+ */
+export declare function discoverMarkdownSource(html: string, url: string, ext: string): string | null;

package/dist/markit.js CHANGED

@@ -19,6 +19,7 @@ import { XlsxConverter } from "./converters/xlsx.js";
 import { XmlConverter } from "./converters/xml.js";
 import { YamlConverter } from "./converters/yaml.js";
 import { ZipConverter } from "./converters/zip.js";
+const USER_AGENT = "markit/0.1.0";
 export class Markit {
     converters = [];
     options;
@@ -86,28 +87,121 @@ export class Markit {
                 // Fall through to default fetch path
             }
         }
+        // For root URLs, check if the site has /llms.txt and return it if so
+        const parsedUrl = new URL(url);
+        if (parsedUrl.pathname === "/" || parsedUrl.pathname === "") {
+            const result = await this.tryLlmsTxt(parsedUrl.origin);
+            if (result)
+                return result;
+        }
         const response = await fetch(url, {
             headers: {
                 Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
-                "User-Agent": "mill/0.1.0",
+                "User-Agent": USER_AGENT,
             },
         });
         if (!response.ok) {
             throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
         }
         const contentType = response.headers.get("content-type") || "";
-        const [mimetype] = contentType.split(";");
-        // Derive extension from URL path
+        const mimetype = contentType.split(";")[0].trim();
         const urlPath = new URL(url).pathname;
         const ext = extname(urlPath).toLowerCase();
+        // Content negotiation worked — server returned markdown directly
+        if (mimetype === "text/markdown") {
+            const buffer = Buffer.from(await response.arrayBuffer());
+            return this.convert(buffer, {
+                url,
+                mimetype: "text/markdown",
+                extension: ".md",
+                filename: basename(urlPath) || undefined,
+            });
+        }
         const buffer = Buffer.from(await response.arrayBuffer());
-        const fetchedInfo = {
+        // For HTML responses, try to discover a raw markdown source.
+        // Patterns: <link rel="alternate">, VitePress .md files, llms.txt convention.
+        if (mimetype === "text/html") {
+            const result = await this.tryMarkdownSource(buffer, url, ext);
+            if (result)
+                return result;
+        }
+        return this.convert(buffer, {
             url,
-            mimetype: mimetype.trim(),
+            mimetype,
             extension: ext || undefined,
             filename: basename(urlPath) || undefined,
-        };
-        return this.convert(buffer, fetchedInfo);
+        });
+    }
+    /**
+     * For root URLs, check if the site publishes /llms.txt.
+     * If it exists, return it as markdown directly.
+     */
+    async tryLlmsTxt(origin) {
+        const llmsTxtUrl = `${origin}/llms.txt`;
+        try {
+            const response = await fetch(llmsTxtUrl, {
+                method: "HEAD",
+                headers: { "User-Agent": USER_AGENT },
+            });
+            if (!response.ok)
+                return null;
+            const ct = (response.headers.get("content-type") || "")
+                .split(";")[0]
+                .trim();
+            if (!ct.includes("markdown") &&
+                !ct.includes("text/plain") &&
+                !ct.includes("text/html"))
+                return null;
+            // HEAD succeeded — now GET the content
+            const getResponse = await fetch(llmsTxtUrl, {
+                headers: { "User-Agent": USER_AGENT },
+            });
+            if (!getResponse.ok)
+                return null;
+            const buffer = Buffer.from(await getResponse.arrayBuffer());
+            return { markdown: buffer.toString("utf-8") };
+        }
+        catch {
+            return null;
+        }
+    }
+    /**
+     * Inspect an HTML response for a discoverable markdown source URL.
+     * If found, fetch and convert the raw markdown instead.
+     */
+    async tryMarkdownSource(htmlBuffer, url, ext) {
+        const html = htmlBuffer.toString("utf-8", 0, Math.min(htmlBuffer.length, 50_000));
+        const mdSourceUrl = discoverMarkdownSource(html, url, ext);
+        if (!mdSourceUrl)
+            return null;
+        return this.fetchMarkdownSource(mdSourceUrl);
+    }
+    /**
+     * Fetch a markdown source URL, validating the response is actually markdown.
+     */
+    async fetchMarkdownSource(mdUrl) {
+        try {
+            const response = await fetch(mdUrl, {
+                headers: { "User-Agent": USER_AGENT },
+            });
+            if (!response.ok)
+                return null;
+            const ct = (response.headers.get("content-type") || "")
+                .split(";")[0]
+                .trim();
+            if (!ct.includes("markdown") && !ct.includes("text/plain"))
+                return null;
+            const mdBuffer = Buffer.from(await response.arrayBuffer());
+            return this.convert(mdBuffer, {
+                url: mdUrl,
+                mimetype: "text/markdown",
+                extension: ".md",
+                filename: basename(new URL(mdUrl).pathname),
+            });
+        }
+        catch {
+            return null;
+        }
     }
     /**
      * Convert a buffer with stream info to markdown.
@@ -136,3 +230,38 @@ export class Markit {
         throw new Error(`Unsupported format: ${streamInfo.extension || streamInfo.mimetype || "unknown"}`);
     }
 }
+/**
+ * Try to discover a raw markdown source URL from an HTML response.
+ * Checks for known markers in the HTML itself:
+ *   1. <link rel="alternate" type="text/markdown" href="..."> tag
+ *   2. VitePress markers → append .md to the URL
+ *
+ * The llms.txt .md probe is handled separately in tryMarkdownSource
+ * as a fallback when no markers are found.
+ *
+ * @internal Exported for testing.
+ */
+export function discoverMarkdownSource(html, url, ext) {
+    // 1. Look for <link rel="alternate" type="text/markdown" href="...">
+    const linkMatch = html.match(/<link[^>]+rel=["']alternate["'][^>]+type=["']text\/markdown["'][^>]+href=["']([^"']+)["']/i) ??
+        html.match(/<link[^>]+type=["']text\/markdown["'][^>]+rel=["']alternate["'][^>]+href=["']([^"']+)["']/i);
+    if (linkMatch?.[1]) {
+        try {
+            return new URL(linkMatch[1], url).href;
+        }
+        catch {
+            /* ignore malformed URLs */
+        }
+    }
+    // 2. VitePress detection — serves .md alongside HTML
+    if (!ext &&
+        (html.includes("__VP_HASH_MAP__") ||
+            html.includes("VPContent") ||
+            html.includes("vitepress"))) {
+        return appendMdExtension(url);
+    }
+    return null;
+}
+function appendMdExtension(url) {
+    return url.endsWith("/") ? `${url.slice(0, -1)}.md` : `${url}.md`;
+}

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "markit-ai",
-  "version": "0.5.1",
+  "version": "0.5.3",
   "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
   "type": "module",
   "main": "dist/index.js",