npm - extract-from-sitemap - Versions diffs - 0.0.23 → 0.0.24 - Mend

extract-from-sitemap 0.0.23 → 0.0.24

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/cli.js CHANGED Viewed

@@ -222,6 +222,7 @@ async function loadConfig() {
           outDir: "./docs",
           sources: [
             {
+              type: "llms.txt",
               title: "Parallel AI Documentation",
               origin: "https://docs.parallel.ai",
               forceExtract: false,
@@ -229,6 +230,7 @@ async function loadConfig() {
               keepOriginalUrls: false,
             },
             {
+              type: "sitemap",
               title: "Parallel AI Website",
               origin: "https://parallel.ai",
               forceExtract: true,
@@ -236,6 +238,7 @@ async function loadConfig() {
               keepOriginalUrls: false,
             },
             {
+              type: "custom",
               title: "Custom Resources",
               forceExtract: true,
               outDir: "./docs/custom",
@@ -250,6 +253,7 @@ async function loadConfig() {
               ],
             },
             {
+              type: "custom",
               title: "External References",
               keepOriginalUrls: true,
               forceExtract: false,
@@ -291,6 +295,14 @@ async function loadConfig() {
       if (!sourceConfig.title) {
         throw new Error(`sources[${index}].title is required`);
       }
+      if (!sourceConfig.type) {
+        throw new Error(`sources[${index}].type is required`);
+      }
+      if (!["llms.txt", "sitemap", "custom"].includes(sourceConfig.type)) {
+        throw new Error(
+          `sources[${index}].type must be one of: llms.txt, sitemap, custom`
+        );
+      }
       // Set defaults
       sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
@@ -313,13 +325,21 @@ async function loadConfig() {
         }
       }
-      // Either origin or customUrls must be provided
+      // Validate type-specific requirements
+      if (
+        (sourceConfig.type === "llms.txt" || sourceConfig.type === "sitemap") &&
+        !sourceConfig.origin
+      ) {
+        throw new Error(
+          `sources[${index}] with type '${sourceConfig.type}' requires origin`
+        );
+      }
       if (
-        !sourceConfig.origin &&
+        sourceConfig.type === "custom" &&
         (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
       ) {
         throw new Error(
-          `sources[${index}] must have either origin or customUrls`
+          `sources[${index}] with type 'custom' requires customUrls with at least one entry`
         );
       }

package/mod.js CHANGED Viewed

@@ -26,8 +26,9 @@ import { parseLlmsTxt } from "parse-llms-txt";
 /**
  * @typedef {Object} SourceConfig
+ * @property {"llms.txt" | "sitemap" | "custom"} type - The source type
  * @property {string} title - The title for this source
- * @property {string} [origin] - The origin URL to process (optional)
+ * @property {string} [origin] - The origin URL to process (required for llms.txt and sitemap types)
  * @property {string} [outDir] - Output directory for this source's extracted files
  * @property {boolean} [forceExtract] - Whether to force extraction for this source
  * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
@@ -278,8 +279,34 @@ async function extractFromLlmsTxtEntries(
 }
 /**
- * Extract content from sitemap URLs with markdown variant detection
- * Tries llms.txt first if available, then falls back to sitemap
+ * Extract content from llms.txt only (no fallback to sitemap)
+ * @param {string} origin - The origin URL to extract from
+ * @param {string} apiKey - Parallel API key
+ * @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
+ * @returns {Promise<ResponseData>}
+ */
+export async function extractFromLlmsTxt(origin, apiKey, titleRemovePattern) {
+  const llmsTxtContent = await fetchLlmsTxt(origin);
+  if (!llmsTxtContent) {
+    throw new Error(`Could not find llms.txt for ${origin}`);
+  }
+  const llmsTxt = parseLlmsTxt(llmsTxtContent);
+  const totalEntries = llmsTxt.sections.reduce(
+    (sum, section) => sum + section.files.length,
+    0,
+  );
+  if (totalEntries === 0) {
+    throw new Error(`llms.txt found but contains no entries for ${origin}`);
+  }
+  console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
+  return extractFromLlmsTxtEntries(llmsTxt, origin, apiKey, titleRemovePattern);
+}
+/**
+ * Extract content from sitemap URLs with markdown variant detection (no llms.txt fallback)
  * @param {string} origin - The origin URL to extract from
  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
  * @param {string} apiKey - Parallel API key
@@ -296,29 +323,9 @@ export async function extractFromSitemap(
   let fetchCount = 0;
   let extractApiCallCount = 0;
-  // Try llms.txt first
-  const llmsTxtContent = await fetchLlmsTxt(origin);
-  if (llmsTxtContent) {
-    const llmsTxt = parseLlmsTxt(llmsTxtContent);
-    const totalEntries = llmsTxt.sections.reduce(
-      (sum, section) => sum + section.files.length,
-      0,
-    );
-    if (totalEntries > 0) {
-      console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
-      return extractFromLlmsTxtEntries(
-        llmsTxt,
-        origin,
-        apiKey,
-        titleRemovePattern,
-      );
-    }
-  }
-  // Fall back to sitemap discovery
   const sitemapUrl = await discoverSitemap(origin);
   if (!sitemapUrl) {
-    throw new Error(`Could not find sitemap or llms.txt for ${origin}`);
+    throw new Error(`Could not find sitemap for ${origin}`);
   }
   // Parse sitemap and get URLs
@@ -531,11 +538,10 @@ export async function processLLMTextConfig(config, apiKey) {
     let sourceFiles = {};
     try {
-      // Process origin if provided
-      if (sourceConfig.origin) {
-        const result = await extractFromSitemap(
+      // Process based on type
+      if (sourceConfig.type === "llms.txt" && sourceConfig.origin) {
+        const result = await extractFromLlmsTxt(
           sourceConfig.origin,
-          sourceConfig.forceExtract || false,
           apiKey,
           sourceConfig.titleRemovePattern,
         );
@@ -544,22 +550,33 @@ export async function processLLMTextConfig(config, apiKey) {
         totalTokens += result.totalTokens;
         totalPages += result.totalPages;
         totalErrors += result.errors;
-      }
-      // Process custom URLs for this source
-      if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
-        const customFiles = await processCustomUrls(
-          sourceConfig.customUrls,
+      } else if (sourceConfig.type === "sitemap" && sourceConfig.origin) {
+        const result = await extractFromSitemap(
+          sourceConfig.origin,
+          sourceConfig.forceExtract || false,
           apiKey,
+          sourceConfig.titleRemovePattern,
         );
-        // Merge custom files with sitemap files
-        sourceFiles = { ...sourceFiles, ...customFiles };
-        for (const file of Object.values(customFiles)) {
-          totalTokens += file.tokens;
-          totalPages++;
-          if (file.error) totalErrors++;
+        sourceFiles = result.files;
+        totalTokens += result.totalTokens;
+        totalPages += result.totalPages;
+        totalErrors += result.errors;
+      } else if (sourceConfig.type === "custom") {
+        // Process custom URLs for this source
+        if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
+          const customFiles = await processCustomUrls(
+            sourceConfig.customUrls,
+            apiKey,
+          );
+          sourceFiles = customFiles;
+          for (const file of Object.values(customFiles)) {
+            totalTokens += file.tokens;
+            totalPages++;
+            if (file.error) totalErrors++;
+          }
         }
       }

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.23",
+  "version": "0.0.24",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [