extract-from-sitemap 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +44 -40
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -7,14 +7,20 @@ const crypto = require("crypto");
|
|
|
7
7
|
const http = require("http");
|
|
8
8
|
const { URL, URLSearchParams } = require("url");
|
|
9
9
|
const os = require("os");
|
|
10
|
+
const { extractFromSitemap } = require("./mod.js");
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* @typedef {Object} OriginConfig
|
|
14
|
+
* @property {string} origin - The origin URL to process
|
|
15
|
+
* @property {boolean} forceExtract - Whether to force extraction for this origin
|
|
16
|
+
*/
|
|
10
17
|
|
|
11
18
|
/**
|
|
12
19
|
* @typedef {Object} Config
|
|
13
20
|
* @property {string} outDir - Output directory for extracted files
|
|
14
|
-
* @property {
|
|
21
|
+
* @property {OriginConfig[]} origins - Array of origin configurations
|
|
15
22
|
* @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
|
|
16
23
|
* @property {boolean} keepOriginalUrls - Whether to keep original URL structure
|
|
17
|
-
* @property {boolean} forceExtract - Whether to force extraction even if files exist
|
|
18
24
|
*/
|
|
19
25
|
|
|
20
26
|
/**
|
|
@@ -209,11 +215,14 @@ async function loadConfig() {
|
|
|
209
215
|
console.log(
|
|
210
216
|
JSON.stringify(
|
|
211
217
|
{
|
|
218
|
+
$schema: "https://extract.llmtext.com/llmtext.schema.json",
|
|
212
219
|
outDir: "./docs",
|
|
213
|
-
origins: [
|
|
220
|
+
origins: [
|
|
221
|
+
{ origin: "https://docs.parallel.ai", forceExtract: false },
|
|
222
|
+
{ origin: "https://parallel.ai", forceExtract: true },
|
|
223
|
+
],
|
|
214
224
|
customUrls: [],
|
|
215
225
|
keepOriginalUrls: false,
|
|
216
|
-
forceExtract: false,
|
|
217
226
|
},
|
|
218
227
|
null,
|
|
219
228
|
2
|
|
@@ -230,10 +239,22 @@ async function loadConfig() {
|
|
|
230
239
|
if (!Array.isArray(config.origins))
|
|
231
240
|
throw new Error("origins must be an array");
|
|
232
241
|
|
|
242
|
+
// Validate origin objects
|
|
243
|
+
for (const [index, originConfig] of config.origins.entries()) {
|
|
244
|
+
if (typeof originConfig !== "object" || originConfig === null) {
|
|
245
|
+
throw new Error(`origins[${index}] must be an object`);
|
|
246
|
+
}
|
|
247
|
+
if (!originConfig.origin) {
|
|
248
|
+
throw new Error(`origins[${index}].origin is required`);
|
|
249
|
+
}
|
|
250
|
+
if (typeof originConfig.forceExtract !== "boolean") {
|
|
251
|
+
throw new Error(`origins[${index}].forceExtract must be a boolean`);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
233
255
|
// Set defaults
|
|
234
256
|
config.customUrls = config.customUrls || [];
|
|
235
257
|
config.keepOriginalUrls = config.keepOriginalUrls ?? false;
|
|
236
|
-
config.forceExtract = config.forceExtract ?? false;
|
|
237
258
|
|
|
238
259
|
return config;
|
|
239
260
|
} catch (error) {
|
|
@@ -370,10 +391,9 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
|
370
391
|
* Process custom URLs through extraction API
|
|
371
392
|
* @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
|
|
372
393
|
* @param {string} apiKey - API key for authentication
|
|
373
|
-
* @param {boolean} forceExtract - Whether to force extraction
|
|
374
394
|
* @returns {Promise<Record<string, any>>} Extracted files
|
|
375
395
|
*/
|
|
376
|
-
async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
396
|
+
async function processCustomUrls(customUrls, apiKey) {
|
|
377
397
|
const files = {};
|
|
378
398
|
|
|
379
399
|
for (const customUrl of customUrls) {
|
|
@@ -410,6 +430,8 @@ async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
|
410
430
|
tokens: Math.round((extracted.full_content || "").length / 5),
|
|
411
431
|
};
|
|
412
432
|
}
|
|
433
|
+
} else {
|
|
434
|
+
throw new Error(`${response.status} - ${await response.statusText()}`);
|
|
413
435
|
}
|
|
414
436
|
} catch (error) {
|
|
415
437
|
console.error(
|
|
@@ -438,27 +460,6 @@ async function clearCredentials() {
|
|
|
438
460
|
}
|
|
439
461
|
}
|
|
440
462
|
|
|
441
|
-
/**
|
|
442
|
-
* Extract content from sitemap (placeholder - you'll need to implement this)
|
|
443
|
-
* @param {string} origin - The origin URL
|
|
444
|
-
* @param {boolean} forceExtract - Whether to force extraction
|
|
445
|
-
* @param {string} apiKey - API key for authentication
|
|
446
|
-
* @returns {Promise<{totalPages: number, totalTokens: number, errors: number, files: Record<string, any>}>}
|
|
447
|
-
*/
|
|
448
|
-
async function extractFromSitemap(origin, forceExtract, apiKey) {
|
|
449
|
-
// This is a placeholder - you'll need to implement the actual extraction logic
|
|
450
|
-
// or import it from your mod.js file
|
|
451
|
-
console.log(`Extracting from ${origin} (force: ${forceExtract})`);
|
|
452
|
-
|
|
453
|
-
// For now, return empty result
|
|
454
|
-
return {
|
|
455
|
-
totalPages: 0,
|
|
456
|
-
totalTokens: 0,
|
|
457
|
-
errors: 0,
|
|
458
|
-
files: {},
|
|
459
|
-
};
|
|
460
|
-
}
|
|
461
|
-
|
|
462
463
|
/**
|
|
463
464
|
* Main function
|
|
464
465
|
*/
|
|
@@ -487,14 +488,16 @@ async function main() {
|
|
|
487
488
|
let totalPages = 0;
|
|
488
489
|
let totalErrors = 0;
|
|
489
490
|
|
|
490
|
-
// Process each origin
|
|
491
|
-
for (const origin of config.origins) {
|
|
492
|
-
console.log(
|
|
491
|
+
// Process each origin with its own forceExtract setting
|
|
492
|
+
for (const originConfig of config.origins) {
|
|
493
|
+
console.log(
|
|
494
|
+
`\nš Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
|
|
495
|
+
);
|
|
493
496
|
|
|
494
497
|
try {
|
|
495
498
|
const result = await extractFromSitemap(
|
|
496
|
-
origin,
|
|
497
|
-
|
|
499
|
+
originConfig.origin,
|
|
500
|
+
originConfig.forceExtract,
|
|
498
501
|
apiKey
|
|
499
502
|
);
|
|
500
503
|
|
|
@@ -512,7 +515,9 @@ async function main() {
|
|
|
512
515
|
if (!config.keepOriginalUrls) {
|
|
513
516
|
// Create domain-specific subdirectory
|
|
514
517
|
const domain = new URL(
|
|
515
|
-
origin.startsWith("http")
|
|
518
|
+
originConfig.origin.startsWith("http")
|
|
519
|
+
? originConfig.origin
|
|
520
|
+
: `https://${originConfig.origin}`
|
|
516
521
|
).hostname;
|
|
517
522
|
const domainDir = path.join(config.outDir, domain);
|
|
518
523
|
fs.mkdirSync(domainDir, { recursive: true });
|
|
@@ -538,7 +543,10 @@ async function main() {
|
|
|
538
543
|
totalPages += result.totalPages;
|
|
539
544
|
totalErrors += result.errors;
|
|
540
545
|
} catch (error) {
|
|
541
|
-
console.error(
|
|
546
|
+
console.error(
|
|
547
|
+
`❌ Error processing ${originConfig.origin}:`,
|
|
548
|
+
error.message
|
|
549
|
+
);
|
|
542
550
|
totalErrors++;
|
|
543
551
|
}
|
|
544
552
|
}
|
|
@@ -546,11 +554,7 @@ async function main() {
|
|
|
546
554
|
// Process custom URLs
|
|
547
555
|
if (config.customUrls.length > 0) {
|
|
548
556
|
console.log(`\nš Processing ${config.customUrls.length} custom URLs...`);
|
|
549
|
-
const customFiles = await processCustomUrls(
|
|
550
|
-
config.customUrls,
|
|
551
|
-
apiKey,
|
|
552
|
-
config.forceExtract
|
|
553
|
-
);
|
|
557
|
+
const customFiles = await processCustomUrls(config.customUrls, apiKey);
|
|
554
558
|
|
|
555
559
|
for (const [filename, file] of Object.entries(customFiles)) {
|
|
556
560
|
const filePath = path.join(config.outDir, filename);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.
|
|
4
|
+
"version": "0.0.5",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|