npm - extract-from-sitemap - Versions diffs - 0.0.5 → 0.0.7 - Mend

extract-from-sitemap 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/cli.js CHANGED

@@ -10,17 +10,19 @@ const os = require("os");
 const { extractFromSitemap } = require("./mod.js");
 /**
- * @typedef {Object} OriginConfig
- * @property {string} origin - The origin URL to process
- * @property {boolean} forceExtract - Whether to force extraction for this origin
+ * @typedef {Object} SourceConfig
+ * @property {string} title - The title for this source
+ * @property {string} [origin] - The origin URL to process (optional)
+ * @property {string} [outDir] - Output directory for this source's extracted files
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
  */
 /**
  * @typedef {Object} Config
- * @property {string} outDir - Output directory for extracted files
- * @property {OriginConfig[]} origins - Array of origin configurations
- * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
- * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
+ * @property {string} outDir - Top-level output directory for combined llms.txt
+ * @property {SourceConfig[]} sources - Array of source configurations
  */
 /**
@@ -217,12 +219,49 @@ async function loadConfig() {
         {
           $schema: "https://extract.llmtext.com/llmtext.schema.json",
           outDir: "./docs",
-          origins: [
-            { origin: "https://docs.parallel.ai", forceExtract: false },
-            { origin: "https://parallel.ai", forceExtract: true },
+          sources: [
+            {
+              title: "Parallel AI Documentation",
+              origin: "https://docs.parallel.ai",
+              forceExtract: false,
+              outDir: "./docs/parallel-docs",
+              keepOriginalUrls: false,
+            },
+            {
+              title: "Parallel AI Website",
+              origin: "https://parallel.ai",
+              forceExtract: true,
+              outDir: "./docs/parallel-main",
+              keepOriginalUrls: false,
+            },
+            {
+              title: "Custom Resources",
+              forceExtract: true,
+              outDir: "./docs/custom",
+              keepOriginalUrls: false,
+              customUrls: [
+                {
+                  title: "Custom Page",
+                  description: "A custom page to extract",
+                  filename: "custom-page",
+                  url: "https://example.com/page",
+                },
+              ],
+            },
+            {
+              title: "External References",
+              keepOriginalUrls: true,
+              forceExtract: false,
+              customUrls: [
+                {
+                  title: "External API Guide",
+                  description: "Third-party API documentation",
+                  filename: "external-api",
+                  url: "https://external.com/api-guide",
+                },
+              ],
+            },
           ],
-          customUrls: [],
-          keepOriginalUrls: false,
         },
         null,
         2
@@ -236,25 +275,56 @@ async function loadConfig() {
     // Validate required fields
     if (!config.outDir) throw new Error("outDir is required");
-    if (!Array.isArray(config.origins))
-      throw new Error("origins must be an array");
+    if (!Array.isArray(config.sources))
+      throw new Error("sources must be an array");
-    // Validate origin objects
-    for (const [index, originConfig] of config.origins.entries()) {
-      if (typeof originConfig !== "object" || originConfig === null) {
-        throw new Error(`origins[${index}] must be an object`);
+    // Validate source objects
+    for (const [index, sourceConfig] of config.sources.entries()) {
+      if (typeof sourceConfig !== "object" || sourceConfig === null) {
+        throw new Error(`sources[${index}] must be an object`);
       }
-      if (!originConfig.origin) {
-        throw new Error(`origins[${index}].origin is required`);
+      if (!sourceConfig.title) {
+        throw new Error(`sources[${index}].title is required`);
       }
-      if (typeof originConfig.forceExtract !== "boolean") {
-        throw new Error(`origins[${index}].forceExtract must be a boolean`);
+      // Set defaults
+      sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
+      sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
+      sourceConfig.customUrls = sourceConfig.customUrls || [];
+      // Either origin or customUrls must be provided
+      if (
+        !sourceConfig.origin &&
+        (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
+      ) {
+        throw new Error(
+          `sources[${index}] must have either origin or customUrls`
+        );
       }
-    }
-    // Set defaults
-    config.customUrls = config.customUrls || [];
-    config.keepOriginalUrls = config.keepOriginalUrls ?? false;
+      // outDir is required unless keepOriginalUrls is true
+      if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
+        throw new Error(
+          `sources[${index}].outDir is required when keepOriginalUrls is false`
+        );
+      }
+      // Validate customUrls
+      for (const [urlIndex, customUrl] of (
+        sourceConfig.customUrls || []
+      ).entries()) {
+        if (
+          !customUrl.title ||
+          !customUrl.description ||
+          !customUrl.filename ||
+          !customUrl.url
+        ) {
+          throw new Error(
+            `sources[${index}].customUrls[${urlIndex}] must have title, description, filename, and url`
+          );
+        }
+      }
+    }
     return config;
   } catch (error) {
@@ -389,7 +459,7 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
 /**
  * Process custom URLs through extraction API
- * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
  * @param {string} apiKey - API key for authentication
  * @returns {Promise<Record<string, any>>} Extracted files
  */
@@ -417,8 +487,7 @@ async function processCustomUrls(customUrls, apiKey) {
         const result = await response.json();
         if (result.results && result.results.length > 0) {
           const extracted = result.results[0];
-          const filename =
-            customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
+          const filename = customUrl.filename + ".md";
           files[filename] = {
             content: extracted.full_content || "",
@@ -428,6 +497,7 @@ async function processCustomUrls(customUrls, apiKey) {
             publishedDate: extracted.published_date || "",
             status: 200,
             tokens: Math.round((extracted.full_content || "").length / 5),
+            originalUrl: customUrl.url,
           };
         }
       } else {
@@ -444,6 +514,43 @@ async function processCustomUrls(customUrls, apiKey) {
   return files;
 }
+/**
+ * Generate combined llms.txt from all sources
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
+ * @returns {string} Combined llms.txt content
+ */
+function generateCombinedLlmsTxt(allSources) {
+  let combinedTxt =
+    "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
+  for (const source of allSources) {
+    combinedTxt += `## ${source.title}\n\n`;
+    // Sort files by path for consistent ordering
+    const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
+      a.localeCompare(b)
+    );
+    for (const [path, file] of sortedFiles) {
+      if (file.content || file.title) {
+        const title = file.title || path.replace(".md", "");
+        const description = file.description ? `: ${file.description}` : "";
+        // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
+        const link = source.keepOriginalUrls
+          ? file.originalUrl
+          : path.replace(".md", "");
+        combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
+      }
+    }
+    combinedTxt += "\n";
+  }
+  return combinedTxt;
+}
 /**
  * Clear stored API key credentials
  */
@@ -477,114 +584,145 @@ async function main() {
     const config = await loadConfig();
     const apiKey = await getApiKey();
-    // Ensure output directory exists
+    // Ensure top-level output directory exists
     fs.mkdirSync(config.outDir, { recursive: true });
-    // Load previous manifest
-    const previousManifest = loadManifest(config.outDir);
-    const currentFiles = [];
+    const allSources = [];
     let totalTokens = 0;
     let totalPages = 0;
     let totalErrors = 0;
-    // Process each origin with its own forceExtract setting
-    for (const originConfig of config.origins) {
+    // Process each source
+    for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
+      const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;
       console.log(
-        `\n🌐 Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
+        `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
       );
+      // Only ensure source output directory exists if not keeping original URLs
+      if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+        fs.mkdirSync(sourceConfig.outDir, { recursive: true });
+      }
+      // Load previous manifest for this source (only if we have an outDir)
+      const previousManifest = sourceConfig.outDir
+        ? loadManifest(sourceConfig.outDir)
+        : { files: [], timestamp: new Date().toISOString() };
+      const currentFiles = [];
+      let sourceFiles = {};
       try {
-        const result = await extractFromSitemap(
-          originConfig.origin,
-          originConfig.forceExtract,
-          apiKey
-        );
+        // Process origin if provided
+        if (sourceConfig.origin) {
+          const result = await extractFromSitemap(
+            sourceConfig.origin,
+            sourceConfig.forceExtract,
+            apiKey
+          );
+          console.log(
+            `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
+          );
+          if (result.errors > 0) {
+            console.log(`⚠️  ${result.errors} errors occurred`);
+          }
-        console.log(
-          `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
-        );
-        if (result.errors > 0) {
-          console.log(`⚠️  ${result.errors} errors occurred`);
+          sourceFiles = result.files;
+          totalTokens += result.totalTokens;
+          totalPages += result.totalPages;
+          totalErrors += result.errors;
         }
-        // Write files to disk
-        for (const [filePath, file] of Object.entries(result.files)) {
-          let filename = filePath;
-          if (!config.keepOriginalUrls) {
-            // Create domain-specific subdirectory
-            const domain = new URL(
-              originConfig.origin.startsWith("http")
-                ? originConfig.origin
-                : `https://${originConfig.origin}`
-            ).hostname;
-            const domainDir = path.join(config.outDir, domain);
-            fs.mkdirSync(domainDir, { recursive: true });
-            filename = path.join(
-              domain,
-              filePath.startsWith("/") ? filePath.slice(1) : filePath
-            );
-          } else {
-            filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
+        // Process custom URLs for this source
+        if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
+          console.log(
+            `📋 Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
+          );
+          const customFiles = await processCustomUrls(
+            sourceConfig.customUrls,
+            apiKey
+          );
+          // Merge custom files with sitemap files
+          sourceFiles = { ...sourceFiles, ...customFiles };
+          for (const file of Object.values(customFiles)) {
+            totalTokens += file.tokens;
+            totalPages++;
           }
+        }
-          const fullFilePath = path.join(config.outDir, filename);
-          const fileDir = path.dirname(fullFilePath);
+        // Write files to source directory (only if not keeping original URLs)
+        if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+          for (const [filePath, file] of Object.entries(sourceFiles)) {
+            let filename = filePath.startsWith("/")
+              ? filePath.slice(1)
+              : filePath;
-          fs.mkdirSync(fileDir, { recursive: true });
-          fs.writeFileSync(fullFilePath, file.content);
-          currentFiles.push(filename);
+            const fullFilePath = path.join(sourceConfig.outDir, filename);
+            const fileDir = path.dirname(fullFilePath);
-          console.log(`📝 Wrote: ${filename} (${file.tokens} tokens)`);
-        }
+            fs.mkdirSync(fileDir, { recursive: true });
+            fs.writeFileSync(fullFilePath, file.content);
+            currentFiles.push(filename);
-        totalTokens += result.totalTokens;
-        totalPages += result.totalPages;
-        totalErrors += result.errors;
-      } catch (error) {
-        console.error(
-          `❌ Error processing ${originConfig.origin}:`,
-          error.message
-        );
-        totalErrors++;
-      }
-    }
+            console.log(
+              `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
+                file.tokens
+              } tokens)`
+            );
+          }
-    // Process custom URLs
-    if (config.customUrls.length > 0) {
-      console.log(`\n📋 Processing ${config.customUrls.length} custom URLs...`);
-      const customFiles = await processCustomUrls(config.customUrls, apiKey);
+          // Clean up old files for this source
+          if (previousManifest.files.length > 0) {
+            cleanupOldFiles(
+              sourceConfig.outDir,
+              currentFiles,
+              previousManifest.files
+            );
+          }
-      for (const [filename, file] of Object.entries(customFiles)) {
-        const filePath = path.join(config.outDir, filename);
-        fs.writeFileSync(filePath, file.content);
-        currentFiles.push(filename);
-        totalTokens += file.tokens;
-        totalPages++;
+          // Save manifest for this source
+          const newManifest = {
+            files: currentFiles,
+            timestamp: new Date().toISOString(),
+          };
+          saveManifest(sourceConfig.outDir, newManifest);
+        } else {
+          console.log(
+            `📋 Keeping original URLs - not saving files locally for ${sourceName}`
+          );
+        }
-        console.log(`📝 Wrote: ${filename} (${file.tokens} tokens)`);
+        // Add to all sources for combined llms.txt
+        allSources.push({
+          title: sourceConfig.title,
+          files: sourceFiles,
+          keepOriginalUrls: sourceConfig.keepOriginalUrls,
+        });
+      } catch (error) {
+        console.error(`❌ Error processing ${sourceName}:`, error.message);
+        totalErrors++;
       }
     }
-    // Clean up old files
-    if (previousManifest.files.length > 0) {
-      cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
+    // Generate and write combined llms.txt to top-level outDir
+    if (allSources.length > 0) {
+      const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
+      const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
+      fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
+      console.log(`\n📋 Generated combined llms.txt: ${combinedLlmsTxtPath}`);
     }
-    // Save new manifest
-    const newManifest = {
-      files: currentFiles,
-      timestamp: new Date().toISOString(),
-    };
-    saveManifest(config.outDir, newManifest);
     console.log("\n✨ Extraction completed!");
     console.log(`📊 Total: ${totalPages} pages, ${totalTokens} tokens`);
     if (totalErrors > 0) {
       console.log(`⚠️  Errors: ${totalErrors}`);
     }
-    console.log(`📁 Output directory: ${path.resolve(config.outDir)}`);
+    console.log(
+      `📁 Top-level output directory: ${path.resolve(config.outDir)}`
+    );
     console.log("\n💡 Use --clear-credentials to remove stored API key");
   } catch (error) {
     console.error("💥 Fatal error:", error.message);

package/mod.js CHANGED

@@ -8,6 +8,7 @@
  * @property {boolean} extracted - Whether the content was extracted or directly fetched
  * @property {number} status - HTTP status code or processing status
  * @property {number} tokens - Number of tokens in the content
+ * @property {string} originalUrl - The original URL of the content
  */
 /**
@@ -62,6 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
           status: result.status,
           tokens: Math.round(result.content.length / 5),
           publishedDate: result.publishedDate || "",
+          originalUrl: urlStr,
           error: result.error,
         };
@@ -80,6 +82,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
           status: 0,
           tokens: 0,
           publishedDate: "",
+          originalUrl: urlStr,
         };
         if (!forceExtract) {
           urlsNeedingExtract.push(urlStr);
@@ -108,6 +111,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
           status: 0,
           tokens: 0,
           publishedDate: "",
+          originalUrl: result.url,
         };
         const content = result.full_content || existing.content;
@@ -122,6 +126,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
           publishedDate: result.published_date || existing.publishedDate,
           status: existing.status,
           tokens: Math.round(content.length / 5),
+          originalUrl: existing.originalUrl,
         };
       }
@@ -137,18 +142,6 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
     }
   }
-  // Generate llms.txt
-  const llmsTxt = generateLlmsTxt(origin, files);
-  files["/llms.txt"] = {
-    content: llmsTxt,
-    title: "LLMs.txt",
-    description: "LLM-friendly content listing",
-    extracted: false,
-    publishedDate: "",
-    status: 200,
-    tokens: Math.round(llmsTxt.length / 5),
-  };
   // Sort files by path
   const sortedFiles = Object.keys(files)
     .sort()
@@ -162,7 +155,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
     (sum, file) => sum + file.tokens,
     0
   );
-  const totalPages = Object.keys(sortedFiles).length - 1; // Exclude llms.txt from page count
+  const totalPages = Object.keys(sortedFiles).length;
   const errors = Object.values(sortedFiles).filter((file) => file.error).length;
   const processingTimeMs = Date.now() - startTime;
@@ -527,44 +520,6 @@ function getPathFromUrl(urlStr) {
   }
 }
-/**
- * Generate llms.txt content
- * @param {string} origin - Site origin
- * @param {Record<string, any>} files - Files object
- * @returns {string} Generated llms.txt content
- */
-function generateLlmsTxt(origin, files) {
-  // Find homepage for top-level description
-  const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
-  const siteTitle =
-    homepageFile?.title ||
-    new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
-  const siteDescription =
-    homepageFile?.description || `Documentation for ${siteTitle}`;
-  let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
-  // Add documentation section
-  llmsTxt += "## Documentation\n\n";
-  // Sort files by path for consistent ordering
-  const sortedFiles = Object.entries(files)
-    .filter(([path]) => path !== "/llms.txt")
-    .sort(([a], [b]) => a.localeCompare(b));
-  for (const [path, file] of sortedFiles) {
-    if (file.content || file.title) {
-      const title = file.title || path.replace(".md", "");
-      const description = file.description ? `: ${file.description}` : "";
-      llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
-        file.tokens
-      } tokens)${description}\n`;
-    }
-  }
-  return llmsTxt;
-}
 /**
  * Call Parallel Extract API for multiple URLs
  * @param {string[]} urls - URLs to extract

package/package.json CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.5",
+  "version": "0.0.7",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [