extract-from-sitemap 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +113 -79
- package/mod.js +6 -51
- package/package.json +1 -1
package/cli.js
CHANGED
@@ -11,17 +11,18 @@ const { extractFromSitemap } = require("./mod.js");

 /**
  * @typedef {Object} SourceConfig
+ * @property {string} title - The title for this source
  * @property {string} [origin] - The origin URL to process (optional)
- * @property {
- * @property {
- * @property {
+ * @property {string} [outDir] - Output directory for this source's extracted files
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
  */

 /**
  * @typedef {Object} Config
  * @property {string} outDir - Top-level output directory for combined llms.txt
  * @property {SourceConfig[]} sources - Array of source configurations
- * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
  */

 /**

@@ -220,28 +221,47 @@ async function loadConfig() {
     outDir: "./docs",
     sources: [
       {
+        title: "Parallel AI Documentation",
         origin: "https://docs.parallel.ai",
         forceExtract: false,
         outDir: "./docs/parallel-docs",
+        keepOriginalUrls: false,
       },
       {
+        title: "Parallel AI Website",
         origin: "https://parallel.ai",
         forceExtract: true,
         outDir: "./docs/parallel-main",
+        keepOriginalUrls: false,
       },
       {
+        title: "Custom Resources",
         forceExtract: true,
         outDir: "./docs/custom",
+        keepOriginalUrls: false,
         customUrls: [
           {
             title: "Custom Page",
             description: "A custom page to extract",
+            filename: "custom-page",
             url: "https://example.com/page",
           },
         ],
       },
+      {
+        title: "External References",
+        keepOriginalUrls: true,
+        forceExtract: false,
+        customUrls: [
+          {
+            title: "External API Guide",
+            description: "Third-party API documentation",
+            filename: "external-api",
+            url: "https://external.com/api-guide",
+          },
+        ],
+      },
     ],
-    keepOriginalUrls: false,
   },
   null,
   2

@@ -263,12 +283,15 @@ async function loadConfig() {
     if (typeof sourceConfig !== "object" || sourceConfig === null) {
       throw new Error(`sources[${index}] must be an object`);
     }
-    if (!sourceConfig.
-      throw new Error(`sources[${index}].
-    }
-    if (typeof sourceConfig.forceExtract !== "boolean") {
-      throw new Error(`sources[${index}].forceExtract must be a boolean`);
+    if (!sourceConfig.title) {
+      throw new Error(`sources[${index}].title is required`);
     }
+
+    // Set defaults
+    sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
+    sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
+    sourceConfig.customUrls = sourceConfig.customUrls || [];
+
     // Either origin or customUrls must be provided
     if (
       !sourceConfig.origin &&

@@ -278,14 +301,29 @@ async function loadConfig() {
         `sources[${index}] must have either origin or customUrls`
       );
     }
-    }

-
-
+    // outDir is required unless keepOriginalUrls is true
+    if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
+      throw new Error(
+        `sources[${index}].outDir is required when keepOriginalUrls is false`
+      );
+    }

-
-
-
+    // Validate customUrls
+    for (const [urlIndex, customUrl] of (
+      sourceConfig.customUrls || []
+    ).entries()) {
+      if (
+        !customUrl.title ||
+        !customUrl.description ||
+        !customUrl.filename ||
+        !customUrl.url
+      ) {
+        throw new Error(
+          `sources[${index}].customUrls[${urlIndex}] must have title, description, filename, and url`
+        );
+      }
+    }
   }

   return config;

@@ -421,7 +459,7 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {

 /**
  * Process custom URLs through extraction API
- * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
  * @param {string} apiKey - API key for authentication
  * @returns {Promise<Record<string, any>>} Extracted files
  */

@@ -449,8 +487,7 @@ async function processCustomUrls(customUrls, apiKey) {
       const result = await response.json();
       if (result.results && result.results.length > 0) {
         const extracted = result.results[0];
-        const filename =
-          customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
+        const filename = customUrl.filename + ".md";

         files[filename] = {
           content: extracted.full_content || "",

@@ -460,6 +497,7 @@ async function processCustomUrls(customUrls, apiKey) {
           publishedDate: extracted.published_date || "",
           status: 200,
           tokens: Math.round((extracted.full_content || "").length / 5),
+          originalUrl: customUrl.url,
         };
       }
     } else {

@@ -478,7 +516,7 @@ async function processCustomUrls(customUrls, apiKey) {

 /**
  * Generate combined llms.txt from all sources
- * @param {Array<{
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
  * @returns {string} Combined llms.txt content
  */
 function generateCombinedLlmsTxt(allSources) {

@@ -486,28 +524,22 @@ function generateCombinedLlmsTxt(allSources) {
     "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";

   for (const source of allSources) {
-
-      ? new URL(
-          source.origin.startsWith("http")
-            ? source.origin
-            : `https://${source.origin}`
-        ).hostname
-      : source.sourceName;
-
-    combinedTxt += `## ${sourceName}\n\n`;
+    combinedTxt += `## ${source.title}\n\n`;

     // Sort files by path for consistent ordering
-    const sortedFiles = Object.entries(source.files)
-      .
-
+    const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
+      a.localeCompare(b)
+    );

     for (const [path, file] of sortedFiles) {
       if (file.content || file.title) {
         const title = file.title || path.replace(".md", "");
         const description = file.description ? `: ${file.description}` : "";
-
-
-
+
+        // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
+        const link = source.keepOriginalUrls ? file.originalUrl : path;
+
+        combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
       }
     }


@@ -560,19 +592,21 @@ async function main() {

   // Process each source
   for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
-    const sourceName = sourceConfig.
-      ? `source ${sourceIndex + 1} (${sourceConfig.origin})`
-      : `source ${sourceIndex + 1} (custom URLs)`;
+    const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;

     console.log(
-      `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract})`
+      `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
     );

-    //
-
+    // Only ensure source output directory exists if not keeping original URLs
+    if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+      fs.mkdirSync(sourceConfig.outDir, { recursive: true });
+    }

-    // Load previous manifest for this source
-    const previousManifest =
+    // Load previous manifest for this source (only if we have an outDir)
+    const previousManifest = sourceConfig.outDir
+      ? loadManifest(sourceConfig.outDir)
+      : { files: [], timestamp: new Date().toISOString() };
     const currentFiles = [];
     let sourceFiles = {};


@@ -617,53 +651,53 @@ async function main() {
        }
      }

-      // Write files to source directory
-
-
+      // Write files to source directory (only if not keeping original URLs)
+      if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+        for (const [filePath, file] of Object.entries(sourceFiles)) {
+          let filename = filePath.startsWith("/")
+            ? filePath.slice(1)
+            : filePath;

-
-
-        filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
-      } else if (!sourceConfig.origin) {
-        // For custom URL sources, use simple filename
-        filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
-      }
+          const fullFilePath = path.join(sourceConfig.outDir, filename);
+          const fileDir = path.dirname(fullFilePath);

-
-
+          fs.mkdirSync(fileDir, { recursive: true });
+          fs.writeFileSync(fullFilePath, file.content);
+          currentFiles.push(filename);

-
-
-
+          console.log(
+            `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
+              file.tokens
+            } tokens)`
+          );
+        }

-
-
-
-
-
-
+        // Clean up old files for this source
+        if (previousManifest.files.length > 0) {
+          cleanupOldFiles(
+            sourceConfig.outDir,
+            currentFiles,
+            previousManifest.files
+          );
+        }

-
-
-
-
-
-
+        // Save manifest for this source
+        const newManifest = {
+          files: currentFiles,
+          timestamp: new Date().toISOString(),
+        };
+        saveManifest(sourceConfig.outDir, newManifest);
+      } else {
+        console.log(
+          `📋 Keeping original URLs - not saving files locally for ${sourceName}`
        );
      }

-      // Save manifest for this source
-      const newManifest = {
-        files: currentFiles,
-        timestamp: new Date().toISOString(),
-      };
-      saveManifest(sourceConfig.outDir, newManifest);
-
      // Add to all sources for combined llms.txt
      allSources.push({
-
-        origin: sourceConfig.origin,
+        title: sourceConfig.title,
        files: sourceFiles,
+        keepOriginalUrls: sourceConfig.keepOriginalUrls,
      });
    } catch (error) {
      console.error(`❌ Error processing ${sourceName}:`, error.message);
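Taken together, the cli.js changes make each source self-describing: `title` is now required, `forceExtract` and `keepOriginalUrls` default to `false`, `outDir` is only required when files are written locally, and every `customUrls` entry must carry an explicit `filename`. A minimal sketch of two source entries under the new validation rules, mirroring the defaults shown in the diff above (values are illustrative, not prescriptive):

```js
// Sketch only: property names follow the new SourceConfig typedef;
// titles, URLs, and paths echo the package's own default config.
const sources = [
  {
    title: "Parallel AI Documentation", // required as of 0.0.8
    origin: "https://docs.parallel.ai", // either origin or customUrls must be set
    outDir: "./docs/parallel-docs",     // required because keepOriginalUrls is false
    forceExtract: false,                // optional; defaults to false
    keepOriginalUrls: false,            // optional; defaults to false
  },
  {
    title: "External References",
    keepOriginalUrls: true, // no outDir needed; nothing is written locally
    customUrls: [
      {
        title: "External API Guide",
        description: "Third-party API documentation",
        filename: "external-api", // the extracted entry is keyed as external-api.md
        url: "https://external.com/api-guide",
      },
    ],
  },
];
```

With `keepOriginalUrls: true`, generateCombinedLlmsTxt links the entry to `file.originalUrl` instead of a local path, and no files or manifest are saved for that source.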
package/mod.js
CHANGED
@@ -8,6 +8,7 @@
  * @property {boolean} extracted - Whether the content was extracted or directly fetched
  * @property {number} status - HTTP status code or processing status
  * @property {number} tokens - Number of tokens in the content
+ * @property {string} originalUrl - The original URL of the content
  */

 /**

@@ -62,6 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        status: result.status,
        tokens: Math.round(result.content.length / 5),
        publishedDate: result.publishedDate || "",
+        originalUrl: urlStr,
        error: result.error,
      };


@@ -80,6 +82,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        status: 0,
        tokens: 0,
        publishedDate: "",
+        originalUrl: urlStr,
      };
      if (!forceExtract) {
        urlsNeedingExtract.push(urlStr);

@@ -108,6 +111,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        status: 0,
        tokens: 0,
        publishedDate: "",
+        originalUrl: result.url,
      };

      const content = result.full_content || existing.content;

@@ -122,6 +126,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        publishedDate: result.published_date || existing.publishedDate,
        status: existing.status,
        tokens: Math.round(content.length / 5),
+        originalUrl: existing.originalUrl,
      };
    }


@@ -137,18 +142,6 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
    }
  }

-  // Generate llms.txt
-  const llmsTxt = generateLlmsTxt(origin, files);
-  files["/llms.txt"] = {
-    content: llmsTxt,
-    title: "LLMs.txt",
-    description: "LLM-friendly content listing",
-    extracted: false,
-    publishedDate: "",
-    status: 200,
-    tokens: Math.round(llmsTxt.length / 5),
-  };
-
  // Sort files by path
  const sortedFiles = Object.keys(files)
    .sort()

@@ -162,7 +155,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
    (sum, file) => sum + file.tokens,
    0
  );
-  const totalPages = Object.keys(sortedFiles).length
+  const totalPages = Object.keys(sortedFiles).length;
  const errors = Object.values(sortedFiles).filter((file) => file.error).length;
  const processingTimeMs = Date.now() - startTime;


@@ -527,44 +520,6 @@ function getPathFromUrl(urlStr) {
  }
 }

-/**
- * Generate llms.txt content
- * @param {string} origin - Site origin
- * @param {Record<string, any>} files - Files object
- * @returns {string} Generated llms.txt content
- */
-function generateLlmsTxt(origin, files) {
-  // Find homepage for top-level description
-  const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
-  const siteTitle =
-    homepageFile?.title ||
-    new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
-  const siteDescription =
-    homepageFile?.description || `Documentation for ${siteTitle}`;
-
-  let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
-
-  // Add documentation section
-  llmsTxt += "## Documentation\n\n";
-
-  // Sort files by path for consistent ordering
-  const sortedFiles = Object.entries(files)
-    .filter(([path]) => path !== "/llms.txt")
-    .sort(([a], [b]) => a.localeCompare(b));
-
-  for (const [path, file] of sortedFiles) {
-    if (file.content || file.title) {
-      const title = file.title || path.replace(".md", "");
-      const description = file.description ? `: ${file.description}` : "";
-      llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
-        file.tokens
-      } tokens)${description}\n`;
-    }
-  }
-
-  return llmsTxt;
-}
-
 /**
  * Call Parallel Extract API for multiple URLs
  * @param {string[]} urls - URLs to extract
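On the mod.js side, every file entry (fresh extraction, cached hit, or failure placeholder) now carries `originalUrl`, and the per-origin `/llms.txt` entry is no longer generated here; combined llms.txt generation lives in the CLI. A minimal consumer sketch, assuming the argument order from the hunk headers and that the resolved value exposes the per-path file map described by the typedef (the exact return shape is not visible in this diff, so the destructured `files` below is an assumption):

```js
import { extractFromSitemap } from "./mod.js";

// Sketch only: extractFromSitemap(origin, forceExtract = false, apiKey),
// as shown in the hunk headers above. `files` is an assumed return field.
const apiKey = "<your Parallel API key>";
const { files } = await extractFromSitemap("https://docs.parallel.ai", false, apiKey);

for (const [path, file] of Object.entries(files)) {
  // As of 0.0.8 each entry includes originalUrl, and no synthetic "/llms.txt"
  // entry is injected into the map by the module itself.
  console.log(`${path} <- ${file.originalUrl} (${file.tokens} tokens)`);
}
```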
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.6",
+  "version": "0.0.8",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [