extract-from-sitemap 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/cli.ts +434 -0
  2. package/mod.js +595 -0
  3. package/package.json +16 -0
package/cli.ts ADDED
@@ -0,0 +1,434 @@
+ #!/usr/bin/env bun
+ /// <reference types="@types/bun" />
+ /// <reference lib="esnext" />
+
+ import {
+   existsSync,
+   readFileSync,
+   writeFileSync,
+   mkdirSync,
+   rmSync,
+   readdirSync,
+ } from "fs";
+ import { join, dirname, resolve } from "path";
+ import { extractFromSitemap } from "./mod.js";
+
+ interface Config {
+   outDir: string;
+   origins: string[];
+   customUrls: Array<{
+     title: string;
+     description: string;
+     url: string;
+   }>;
+   keepOriginalUrls: boolean;
+   forceExtract: boolean;
+ }
+
+ interface Manifest {
+   files: string[];
+   timestamp: string;
+ }
+
+ class OAuth {
+   private clientId: string;
+   private redirectUri: string;
+   private scope: string;
+
+   constructor() {
+     this.clientId = "extract-from-sitemap-cli";
+     this.redirectUri = "http://localhost:3737/callback";
+     this.scope = "key:read";
+   }
+
+   async getApiKey(): Promise<string> {
+     console.log("šŸ” Starting OAuth flow...");
+
+     // Generate PKCE parameters
+     const { codeVerifier, codeChallenge } = await this.generatePKCE();
+
+     // Build authorization URL
+     const authUrl = new URL("https://platform.parallel.ai/getKeys/authorize");
+     authUrl.searchParams.set("client_id", this.clientId);
+     authUrl.searchParams.set("redirect_uri", this.redirectUri);
+     authUrl.searchParams.set("response_type", "code");
+     authUrl.searchParams.set("scope", this.scope);
+     authUrl.searchParams.set("code_challenge", codeChallenge);
+     authUrl.searchParams.set("code_challenge_method", "S256");
+     authUrl.searchParams.set("state", Math.random().toString(36));
+
+     console.log(`\nšŸ“– Please visit this URL to authorize the application:`);
+     console.log(`${authUrl.toString()}\n`);
+
+     // Start simple HTTP server to catch the callback
+     const code = await this.startCallbackServer();
+
+     // Exchange code for token
+     console.log("šŸ”„ Exchanging authorization code for API key...");
+
+     const response = await fetch("https://platform.parallel.ai/getKeys/token", {
+       method: "POST",
+       headers: { "Content-Type": "application/x-www-form-urlencoded" },
+       body: new URLSearchParams({
+         grant_type: "authorization_code",
+         code: code,
+         client_id: this.clientId,
+         redirect_uri: this.redirectUri,
+         code_verifier: codeVerifier,
+       }),
+     });
+
+     if (!response.ok) {
+       throw new Error(
+         `Token exchange failed: ${response.status} ${response.statusText}`
+       );
+     }
+
+     const { access_token } = await response.json();
+     console.log("āœ… Successfully obtained API key!");
+
+     return access_token;
+   }
+
+   private async generatePKCE(): Promise<{
+     codeVerifier: string;
+     codeChallenge: string;
+   }> {
+     const codeVerifier = btoa(
+       String.fromCharCode(...crypto.getRandomValues(new Uint8Array(32)))
+     ).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
+
+     const hash = await crypto.subtle.digest(
+       "SHA-256",
+       new TextEncoder().encode(codeVerifier)
+     );
+     const codeChallenge = btoa(
+       String.fromCharCode(...new Uint8Array(hash))
+     ).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
+
+     return { codeVerifier, codeChallenge };
+   }
+
+   private async startCallbackServer(): Promise<string> {
+     return new Promise((resolve, reject) => {
+       const server = Bun.serve({
+         port: 3737,
+         fetch(req) {
+           const url = new URL(req.url);
+
+           if (url.pathname === "/callback") {
+             const code = url.searchParams.get("code");
+             const error = url.searchParams.get("error");
+
+             if (error) {
+               reject(new Error(`OAuth error: ${error}`));
+               return new Response(
+                 "Error occurred. You can close this window.",
+                 { status: 400 }
+               );
+             }
+
+             if (code) {
+               resolve(code);
+               server.stop();
+               return new Response(
+                 "āœ… Authorization successful! You can close this window and return to the terminal."
+               );
+             }
+           }
+
+           return new Response("Invalid request", { status: 404 });
+         },
+       });
+
+       // Timeout after 5 minutes
+       setTimeout(() => {
+         server.stop();
+         reject(new Error("OAuth flow timed out"));
+       }, 300000);
+     });
+   }
+ }
+
+ async function loadConfig(): Promise<Config> {
+   const configPath = resolve("llmtext.json");
+
+   if (!existsSync(configPath)) {
+     console.error(
+       "āŒ llmtext.json not found. Please create a configuration file."
+     );
+     console.log("\nExample llmtext.json:");
+     console.log(
+       JSON.stringify(
+         {
+           outDir: "./docs",
+           origins: ["https://docs.example.com"],
+           customUrls: [],
+           keepOriginalUrls: false,
+           forceExtract: false,
+         },
+         null,
+         2
+       )
+     );
+     process.exit(1);
+   }
+
+   try {
+     const config = JSON.parse(readFileSync(configPath, "utf8")) as Config;
+
+     // Validate required fields
+     if (!config.outDir) throw new Error("outDir is required");
+     if (!Array.isArray(config.origins))
+       throw new Error("origins must be an array");
+
+     // Set defaults
+     config.customUrls = config.customUrls || [];
+     config.keepOriginalUrls = config.keepOriginalUrls ?? false;
+     config.forceExtract = config.forceExtract ?? false;
+
+     return config;
+   } catch (error) {
+     console.error("āŒ Error reading llmtext.json:", error.message);
+     process.exit(1);
+   }
+ }
+
+ async function getApiKey(): Promise<string> {
+   // Check environment variables first
+   let apiKey = process.env.PARALLEL_API_KEY;
+
+   if (!apiKey && existsSync(".env")) {
+     // Try to load from .env file
+     const envContent = readFileSync(".env", "utf8");
+     const match = envContent.match(/^PARALLEL_API_KEY=(.+)$/m);
+     if (match) {
+       apiKey = match[1].trim();
+     }
+   }
+
+   if (!apiKey) {
+     console.log("šŸ”‘ No API key found in environment or .env file.");
+     const oauth = new OAuth();
+     apiKey = await oauth.getApiKey();
+   }
+
+   return apiKey;
+ }
+
+ function loadManifest(outDir: string): Manifest {
+   const manifestPath = join(outDir, "llmtext-manifest.json");
+
+   if (!existsSync(manifestPath)) {
+     return { files: [], timestamp: new Date().toISOString() };
+   }
+
+   try {
+     return JSON.parse(readFileSync(manifestPath, "utf8"));
+   } catch {
+     return { files: [], timestamp: new Date().toISOString() };
+   }
+ }
+
+ function saveManifest(outDir: string, manifest: Manifest): void {
+   const manifestPath = join(outDir, "llmtext-manifest.json");
+   writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
+ }
+
+ function cleanupOldFiles(
+   outDir: string,
+   currentFiles: string[],
+   previousFiles: string[]
+ ): void {
+   const filesToRemove = previousFiles.filter(
+     (file) => !currentFiles.includes(file)
+   );
+
+   for (const file of filesToRemove) {
+     const filePath = join(outDir, file);
+     try {
+       if (existsSync(filePath)) {
+         rmSync(filePath);
+         console.log(`šŸ—‘ļø Removed old file: ${file}`);
+       }
+     } catch (error) {
+       console.warn(`āš ļø Could not remove ${file}:`, error.message);
+     }
+   }
+ }
+
+ async function processCustomUrls(
+   customUrls: Array<{ title: string; description: string; url: string }>,
+   apiKey: string,
+   forceExtract: boolean
+ ): Promise<Record<string, any>> {
+   const files: Record<string, any> = {};
+
+   for (const customUrl of customUrls) {
+     console.log(`šŸ“„ Processing custom URL: ${customUrl.url}`);
+
+     try {
+       // For custom URLs, we need to extract them individually
+       const response = await fetch("https://api.parallel.ai/v1beta/extract", {
+         method: "POST",
+         headers: {
+           "Content-Type": "application/json",
+           "parallel-beta": "search-extract-2025-10-10",
+           "x-api-key": apiKey,
+         },
+         body: JSON.stringify({
+           urls: [customUrl.url],
+           full_content: true,
+         }),
+       });
+
+       if (response.ok) {
+         const result = await response.json();
+         if (result.results && result.results.length > 0) {
+           const extracted = result.results[0];
+           const filename =
+             customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
+
+           files[filename] = {
+             content: extracted.full_content || "",
+             title: customUrl.title,
+             description: customUrl.description,
+             extracted: true,
+             publishedDate: extracted.published_date || "",
+             status: 200,
+             tokens: Math.round((extracted.full_content || "").length / 5),
+           };
+         }
+       }
+     } catch (error) {
+       console.error(
+         `āŒ Error processing custom URL ${customUrl.url}:`,
+         error.message
+       );
+     }
+   }
+
+   return files;
+ }
+
+ async function main() {
+   console.log("šŸš€ Extract from Sitemap CLI");
+
+   try {
+     const config = await loadConfig();
+     const apiKey = await getApiKey();
+
+     // Ensure output directory exists
+     mkdirSync(config.outDir, { recursive: true });
+
+     // Load previous manifest
+     const previousManifest = loadManifest(config.outDir);
+     const currentFiles: string[] = [];
+
+     let totalTokens = 0;
+     let totalPages = 0;
+     let totalErrors = 0;
+
+     // Process each origin
+     for (const origin of config.origins) {
+       console.log(`\n🌐 Processing origin: ${origin}`);
+
+       try {
+         const result = await extractFromSitemap(
+           origin,
+           config.forceExtract,
+           apiKey
+         );
+
+         console.log(
+           `āœ… Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
+         );
+         if (result.errors > 0) {
+           console.log(`āš ļø ${result.errors} errors occurred`);
+         }
+
+         // Write files to disk
+         for (const [path, file] of Object.entries(result.files)) {
+           let filename = path;
+
+           if (!config.keepOriginalUrls) {
+             // Create domain-specific subdirectory
+             const domain = new URL(
+               origin.startsWith("http") ? origin : `https://${origin}`
+             ).hostname;
+             const domainDir = join(config.outDir, domain);
+             mkdirSync(domainDir, { recursive: true });
+             filename = join(
+               domain,
+               path.startsWith("/") ? path.slice(1) : path
+             );
+           } else {
+             filename = path.startsWith("/") ? path.slice(1) : path;
+           }
+
+           const filePath = join(config.outDir, filename);
+           const fileDir = dirname(filePath);
+
+           mkdirSync(fileDir, { recursive: true });
+           writeFileSync(filePath, file.content);
+           currentFiles.push(filename);
+
+           console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
+         }
+
+         totalTokens += result.totalTokens;
+         totalPages += result.totalPages;
+         totalErrors += result.errors;
+       } catch (error) {
+         console.error(`āŒ Error processing ${origin}:`, error.message);
+         totalErrors++;
+       }
+     }
+
+     // Process custom URLs
+     if (config.customUrls.length > 0) {
+       console.log(`\nšŸ“‹ Processing ${config.customUrls.length} custom URLs...`);
+       const customFiles = await processCustomUrls(
+         config.customUrls,
+         apiKey,
+         config.forceExtract
+       );
+
+       for (const [filename, file] of Object.entries(customFiles)) {
+         const filePath = join(config.outDir, filename);
+         writeFileSync(filePath, file.content);
+         currentFiles.push(filename);
+         totalTokens += file.tokens;
+         totalPages++;
+
+         console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
+       }
+     }
+
+     // Clean up old files
+     if (previousManifest.files.length > 0) {
+       cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
+     }
+
+     // Save new manifest
+     const newManifest: Manifest = {
+       files: currentFiles,
+       timestamp: new Date().toISOString(),
+     };
+     saveManifest(config.outDir, newManifest);
+
+     console.log(`\n✨ Extraction completed!`);
+     console.log(`šŸ“Š Total: ${totalPages} pages, ${totalTokens} tokens`);
+     if (totalErrors > 0) {
+       console.log(`āš ļø Errors: ${totalErrors}`);
+     }
+     console.log(`šŸ“ Output directory: ${resolve(config.outDir)}`);
+   } catch (error) {
+     console.error("šŸ’„ Fatal error:", error.message);
+     process.exit(1);
+   }
+ }
+
+ if (import.meta.main) {
+   main();
+ }
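
A minimal end-to-end sketch of using this CLI (illustrative, with assumptions: docs.example.com is a placeholder, and the bunx invocation assumes Bun resolves the package's bin entry; the shebang on cli.ts requires Bun in any case). The llmtext.json fields mirror the Config interface above; if PARALLEL_API_KEY is in neither the environment nor .env, the OAuth flow runs instead.

llmtext.json:

{
  "outDir": "./docs",
  "origins": ["https://docs.example.com"],
  "customUrls": [],
  "keepOriginalUrls": false,
  "forceExtract": false
}

Run:

PARALLEL_API_KEY=<your-key> bunx extract-from-sitemap

With keepOriginalUrls set to false, pages land under ./docs/<hostname>/ together with that origin's llms.txt, and ./docs/llmtext-manifest.json records what was written so the next run can prune files that dropped out of the sitemap.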
package/mod.js ADDED
@@ -0,0 +1,595 @@
+ /**
+  * @typedef {Object} FileResult
+  * @property {string} [error] - Error message if file processing failed
+  * @property {string} content - The extracted or fetched content of the file
+  * @property {string} publishedDate - The published date of the file/document
+  * @property {string} title - The title of the file/document
+  * @property {string} description - The description of the file/document
+  * @property {boolean} extracted - Whether the content was extracted or directly fetched
+  * @property {number} status - HTTP status code or processing status
+  * @property {number} tokens - Number of tokens in the content
+  */
+
+ /**
+  * @typedef {Object} ResponseData
+  * @property {Record<string, FileResult>} files - Map of file identifiers to their results
+  * @property {number} totalTokens - Total number of tokens across all files
+  * @property {number} totalPages - Total number of pages processed
+  * @property {number} errors - Number of errors encountered during processing
+  * @property {number} processingTimeMs - Total processing time in milliseconds
+  * @property {number} extractApiCallCount - Number of API calls made for content extraction
+  * @property {number} fetchCount - Number of fetch operations performed
+  */
+
+ /**
+  * Extract content from sitemap URLs with markdown variant detection
+  * @param {string} origin - The origin URL to extract from
+  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
+  * @param {string} apiKey - Parallel API key
+  * @returns {Promise<ResponseData>}
+  */
+ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
+   const startTime = Date.now();
+   let fetchCount = 0;
+   let extractApiCallCount = 0;
+
+   // Discover sitemap
+   const sitemapUrl = await discoverSitemap(origin);
+   if (!sitemapUrl) {
+     throw new Error(`Could not find sitemap for ${origin}`);
+   }
+
+   // Parse sitemap and get URLs
+   const urls = await parseSitemap(sitemapUrl);
+
+   // Process each URL
+   const files = {};
+   const urlsNeedingExtract = [];
+
+   // Fetch all URLs with markdown variant detection
+   await Promise.all(
+     urls.map(async (urlStr) => {
+       try {
+         const result = await fetchUrlContent(urlStr, forceExtract);
+         fetchCount += result.fetchCount;
+
+         const path = getPathFromUrl(urlStr) + ".md";
+         files[path] = {
+           content: result.content,
+           title: cleanTitle(result.title, origin),
+           description: cleanDescription(result.description, result.title),
+           extracted: false,
+           status: result.status,
+           tokens: Math.round(result.content.length / 5),
+           publishedDate: result.publishedDate || "",
+           error: result.error,
+         };
+
+         // Track URLs that need Extract API fallback
+         if (!result.content || result.error) {
+           urlsNeedingExtract.push(urlStr);
+         }
+       } catch (error) {
+         const path = getPathFromUrl(urlStr) + ".md";
+         files[path] = {
+           error: error instanceof Error ? error.message : "Unknown error",
+           content: "",
+           title: "",
+           description: "",
+           extracted: false,
+           status: 0,
+           tokens: 0,
+           publishedDate: "",
+         };
+         if (!forceExtract) {
+           urlsNeedingExtract.push(urlStr);
+         }
+       }
+     })
+   );
+
+   // Use Parallel Extract API for URLs that didn't return content
+   if (urlsNeedingExtract.length > 0 && apiKey) {
+     try {
+       extractApiCallCount = 1;
+       const extractResults = await callParallelExtractAPI(
+         urlsNeedingExtract,
+         apiKey
+       );
+
+       // Merge extract results
+       for (const result of extractResults.results) {
+         const path = getPathFromUrl(result.url) + ".md";
+         const existing = files[path] || {
+           content: "",
+           title: "",
+           description: "",
+           extracted: false,
+           status: 0,
+           tokens: 0,
+           publishedDate: "",
+         };
+
+         const content = result.full_content || existing.content;
+         files[path] = {
+           content,
+           title: cleanTitle(result.title || existing.title, origin),
+           description: cleanDescription(
+             existing.description,
+             result.title || existing.title
+           ),
+           extracted: !!result.full_content,
+           publishedDate: result.published_date || existing.publishedDate,
+           status: existing.status,
+           tokens: Math.round(content.length / 5),
+         };
+       }
+
+       // Handle extract errors
+       for (const error of extractResults.errors) {
+         const path = getPathFromUrl(error.url) + ".md";
+         if (files[path]) {
+           files[path].error = error.message;
+         }
+       }
+     } catch (error) {
+       console.error("Extract API error:", error);
+     }
+   }
+
+   // Generate llms.txt
+   const llmsTxt = generateLlmsTxt(origin, files);
+   files["/llms.txt"] = {
+     content: llmsTxt,
+     title: "LLMs.txt",
+     description: "LLM-friendly content listing",
+     extracted: false,
+     publishedDate: "",
+     status: 200,
+     tokens: Math.round(llmsTxt.length / 5),
+   };
+
+   // Sort files by path
+   const sortedFiles = Object.keys(files)
+     .sort()
+     .reduce((acc, key) => {
+       acc[key] = files[key];
+       return acc;
+     }, {});
+
+   // Calculate totals
+   const totalTokens = Object.values(sortedFiles).reduce(
+     (sum, file) => sum + file.tokens,
+     0
+   );
+   const totalPages = Object.keys(sortedFiles).length - 1; // Exclude llms.txt from page count
+   const errors = Object.values(sortedFiles).filter((file) => file.error).length;
+   const processingTimeMs = Date.now() - startTime;
+
+   return {
+     files: sortedFiles,
+     totalTokens,
+     totalPages,
+     errors,
+     processingTimeMs,
+     extractApiCallCount,
+     fetchCount,
+   };
+ }
+
+ /**
+  * Clean title by removing site name duplicates
+  * @param {string} title - Original title
+  * @param {string} origin - Site origin
+  * @returns {string} Cleaned title
+  */
+ function cleanTitle(title, origin) {
+   if (!title) return "";
+
+   // Extract domain name from origin
+   const domain = new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname.replace(/^www\./, "");
+   const siteName = domain.split(".")[0];
+
+   // Remove common site name patterns from end of title
+   const patterns = [
+     new RegExp(`\\s*[-|•]\\s*${siteName}\\s*$`, "i"),
+     new RegExp(`\\s*[-|•]\\s*${domain}\\s*$`, "i"),
+     /\s*[-|•]\s*Home\s*$/i,
+     /\s*[-|•]\s*Documentation\s*$/i,
+   ];
+
+   let cleaned = title;
+   for (const pattern of patterns) {
+     cleaned = cleaned.replace(pattern, "");
+   }
+
+   return cleaned.trim();
+ }
+
+ /**
+  * Clean description by removing title duplicates
+  * @param {string} description - Original description
+  * @param {string} title - Page title
+  * @returns {string} Cleaned description
+  */
+ function cleanDescription(description, title) {
+   if (!description || !title) return description || "";
+
+   // Remove title from beginning of description if it's a duplicate
+   if (description.toLowerCase().startsWith(title.toLowerCase())) {
+     return description
+       .substring(title.length)
+       .replace(/^[.\s-]+/, "")
+       .trim();
+   }
+
+   return description;
+ }
+
+ /**
+  * Discover sitemap URL for a given origin
+  * @param {string} origin - The origin to search for sitemap
+  * @returns {Promise<string|null>} Sitemap URL or null if not found
+  */
+ async function discoverSitemap(origin) {
+   // Ensure origin has protocol
+   const baseUrl = origin.startsWith("http") ? origin : `https://${origin}`;
+   const domain = new URL(baseUrl).origin;
+
+   // Try common sitemap locations
+   const candidates = [
+     `${domain}/sitemap.xml`,
+     `${domain}/sitemap_index.xml`,
+     `${domain}/sitemap-index.xml`,
+     `${domain}/sitemap1.xml`,
+   ];
+
+   // Also check robots.txt
+   try {
+     const robotsRes = await fetch(`${domain}/robots.txt`, {
+       headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+     });
+     if (robotsRes.ok) {
+       const robotsTxt = await robotsRes.text();
+       const sitemapMatch = robotsTxt.match(/Sitemap:\s*(.+)/i);
+       if (sitemapMatch) {
+         candidates.unshift(sitemapMatch[1].trim());
+       }
+     }
+   } catch {}
+
+   // Test each candidate
+   for (const candidate of candidates) {
+     try {
+       const res = await fetch(candidate, {
+         headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+       });
+       if (res.ok) {
+         const contentType = res.headers.get("content-type") || "";
+         if (contentType.includes("xml") || contentType.includes("text")) {
+           return candidate;
+         }
+       }
+     } catch {}
+   }
+
+   return null;
+ }
+
+ /**
+  * Parse sitemap XML and extract URLs
+  * @param {string} sitemapUrl - URL of the sitemap
+  * @returns {Promise<string[]>} Array of URLs found in sitemap
+  */
+ async function parseSitemap(sitemapUrl) {
+   const res = await fetch(sitemapUrl, {
+     headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+   });
+
+   if (!res.ok) {
+     throw new Error(`Failed to fetch sitemap: ${res.status}`);
+   }
+
+   const xml = await res.text();
+   const urls = [];
+
+   // Check if this is a sitemap index
+   const sitemapPattern =
+     /<sitemap>[\s\S]*?<loc>(.+?)<\/loc>[\s\S]*?<\/sitemap>/gi;
+   const sitemapMatches = xml.matchAll(sitemapPattern);
+   const childSitemaps = Array.from(sitemapMatches, (m) => m[1]);
+
+   if (childSitemaps.length > 0) {
+     // Recursively parse child sitemaps
+     const childUrls = await Promise.all(
+       childSitemaps.map((url) => parseSitemap(url))
+     );
+     return childUrls.flat();
+   }
+
+   // Parse regular sitemap
+   const urlPattern = /<url>[\s\S]*?<loc>(.+?)<\/loc>[\s\S]*?<\/url>/gi;
+   const matches = xml.matchAll(urlPattern);
+
+   for (const match of matches) {
+     urls.push(match[1]);
+   }
+
+   return urls;
+ }
+
+ /**
+  * Fetch content from URL with markdown variant detection
+  * @param {string} urlStr - URL to fetch
+  * @param {boolean} forceExtract - Skip markdown variant detection
+  * @returns {Promise<{content: string, title: string, description: string, status: number, error?: string, fetchCount: number, publishedDate?: string}>}
+  */
+ async function fetchUrlContent(urlStr, forceExtract = false) {
+   let title = "";
+   let description = "";
+   let content = "";
+   let error;
+   let status = 0;
+   let fetchCount = 0;
+   let publishedDate = "";
+
+   if (forceExtract) {
+     // Just fetch HTML for metadata when forcing extract
+     try {
+       const res = await fetch(urlStr, {
+         headers: {
+           Accept: "text/html",
+           "User-Agent": "sitemap-to-llmtext-bot/1.0",
+         },
+       });
+       fetchCount++;
+       status = res.status;
+
+       if (res.ok) {
+         const html = await res.text();
+         ({ title, description, publishedDate } = extractMetadata(html));
+       }
+     } catch (err) {
+       error = `HTML fetch failed: ${err.message || "Unknown"}`;
+     }
+
+     return {
+       content,
+       title,
+       description,
+       status,
+       error,
+       fetchCount,
+       publishedDate,
+     };
+   }
+
+   // First, fetch HTML to check for markdown variants
+   let html = "";
+   try {
+     const htmlRes = await fetch(urlStr, {
+       headers: {
+         Accept: "text/html",
+         "User-Agent": "sitemap-to-llmtext-bot/1.0",
+       },
+     });
+     fetchCount++;
+     status = htmlRes.status;
+
+     if (htmlRes.ok) {
+       html = await htmlRes.text();
+       ({ title, description, publishedDate } = extractMetadata(html));
+
+       // Look for markdown alternate link
+       const mdAlternateMatch = html.match(
+         /<link\s+rel=["']alternate["']\s+type=["']text\/markdown["']\s+href=["']([^"']+)["'][^>]*>/i
+       );
+
+       if (mdAlternateMatch) {
+         const mdUrl = new URL(mdAlternateMatch[1], urlStr).href;
+         try {
+           const mdRes = await fetch(mdUrl, {
+             headers: {
+               Accept: "text/markdown, text/plain",
+               "User-Agent": "sitemap-to-llmtext-bot/1.0",
+             },
+           });
+           fetchCount++;
+
+           if (mdRes.ok) {
+             content = await mdRes.text();
+             return {
+               content,
+               title,
+               description,
+               status,
+               fetchCount,
+               publishedDate,
+             };
+           }
+         } catch (mdErr) {
+           // Fall through to try direct markdown request
+         }
+       }
+     }
+   } catch (err) {
+     error = `HTML fetch failed: ${err.message || "Unknown"}`;
+   }
+
+   // Try fetching with markdown accept header
+   try {
+     const mdRes = await fetch(urlStr, {
+       headers: {
+         Accept: "text/markdown",
+         "User-Agent": "sitemap-to-llmtext-bot/1.0",
+       },
+     });
+     fetchCount++;
+     status = status || mdRes.status;
+
+     const contentType = mdRes.headers.get("content-type") || "";
+     if (mdRes.ok && contentType.includes("markdown")) {
+       content = await mdRes.text();
+     }
+   } catch (mdErr) {
+     if (!error) {
+       error = `Markdown fetch failed: ${mdErr.message || "Unknown"}`;
+     }
+   }
+
+   return {
+     content,
+     title,
+     description,
+     status,
+     error,
+     fetchCount,
+     publishedDate,
+   };
+ }
+
+ /**
+  * Extract metadata from HTML
+  * @param {string} html - HTML content
+  * @returns {{title: string, description: string, publishedDate: string}}
+  */
+ function extractMetadata(html) {
+   let title = "";
+   let description = "";
+   let publishedDate = "";
+
+   // Extract title
+   const titleMatch = html.match(/<title>([^<]+)<\/title>/i);
+   if (titleMatch) {
+     title = titleMatch[1].trim();
+   }
+
+   // Extract og:description
+   const ogDescMatch = html.match(
+     /<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i
+   );
+   if (ogDescMatch) {
+     description = ogDescMatch[1].trim();
+   }
+
+   // Fallback to meta description
+   if (!description) {
+     const metaDescMatch = html.match(
+       /<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i
+     );
+     if (metaDescMatch) {
+       description = metaDescMatch[1].trim();
+     }
+   }
+
+   // Extract published date from various meta tags
+   const datePatterns = [
+     /<meta\s+property=["']article:published_time["']\s+content=["']([^"']+)["']/i,
+     /<meta\s+name=["']date["']\s+content=["']([^"']+)["']/i,
+     /<meta\s+name=["']publish-date["']\s+content=["']([^"']+)["']/i,
+   ];
+
+   for (const pattern of datePatterns) {
+     const match = html.match(pattern);
+     if (match) {
+       publishedDate = match[1].trim();
+       break;
+     }
+   }
+
+   return { title, description, publishedDate };
+ }
+
+ /**
+  * Convert URL to file path
+  * @param {string} urlStr - URL to convert
+  * @returns {string} File path
+  */
+ function getPathFromUrl(urlStr) {
+   try {
+     const url = new URL(urlStr);
+     let path = url.pathname;
+
+     // Handle root path
+     if (path === "/" || path === "") {
+       return "/index.html";
+     }
+
+     // Handle paths ending with /
+     if (path.endsWith("/")) {
+       path += "index.html";
+     }
+
+     return path;
+   } catch {
+     // Fallback to a sanitized version of the full URL
+     return "/" + urlStr.replace(/[^a-zA-Z0-9]/g, "_");
+   }
+ }
+
+ /**
+  * Generate llms.txt content
+  * @param {string} origin - Site origin
+  * @param {Record<string, any>} files - Files object
+  * @returns {string} Generated llms.txt content
+  */
+ function generateLlmsTxt(origin, files) {
+   // Find homepage for top-level description
+   const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
+   const siteTitle =
+     homepageFile?.title ||
+     new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
+   const siteDescription =
+     homepageFile?.description || `Documentation for ${siteTitle}`;
+
+   let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
+
+   // Add documentation section
+   llmsTxt += "## Documentation\n\n";
+
+   // Sort files by path for consistent ordering
+   const sortedFiles = Object.entries(files)
+     .filter(([path]) => path !== "/llms.txt")
+     .sort(([a], [b]) => a.localeCompare(b));
+
+   for (const [path, file] of sortedFiles) {
+     if (file.content || file.title) {
+       const title = file.title || path.replace(".md", "");
+       const description = file.description ? `: ${file.description}` : "";
+       llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
+         file.tokens
+       } tokens)${description}\n`;
+     }
+   }
+
+   return llmsTxt;
+ }
+
+ /**
+  * Call Parallel Extract API for multiple URLs
+  * @param {string[]} urls - URLs to extract
+  * @param {string} apiKey - Parallel API key
+  * @returns {Promise<{results: Array<{url: string, published_date: string, full_content: string|null, title: string|null}>, errors: Array<{url: string, message: string}>}>}
+  */
+ async function callParallelExtractAPI(urls, apiKey) {
+   const response = await fetch("https://api.parallel.ai/v1beta/extract", {
+     method: "POST",
+     headers: {
+       "Content-Type": "application/json",
+       "parallel-beta": "search-extract-2025-10-10",
+       "x-api-key": apiKey,
+     },
+     body: JSON.stringify({
+       urls,
+       full_content: true,
+     }),
+   });
+
+   if (!response.ok) {
+     throw new Error(
+       `Extract API failed: ${response.status} ${response.statusText}`
+     );
+   }
+
+   return await response.json();
+ }
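
mod.js can also be used programmatically. A minimal sketch following the JSDoc above (assumptions: a runtime with a global fetch such as Bun or Node 18+, a Parallel API key in the environment, and that the import specifier resolves through this package's name and main fields):

import { extractFromSitemap } from "extract-from-sitemap";

// forceExtract = false: prefer markdown variants (rel="alternate" links and
// Accept: text/markdown requests), falling back to the Parallel Extract API
// only for pages that came back empty or errored.
const result = await extractFromSitemap(
  "https://docs.example.com",
  false,
  process.env.PARALLEL_API_KEY,
);

console.log(`${result.totalPages} pages, ${result.totalTokens} tokens`);
console.log(`${result.errors} errors, ${result.fetchCount} fetches, ${result.extractApiCallCount} Extract API call(s)`);
console.log(result.files["/llms.txt"].content); // the generated index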
package/package.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "name": "extract-from-sitemap",
+   "bin": "cli.ts",
+   "version": "0.0.1",
+   "main": "mod.js",
+   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and an llms.txt, using Parallel.ai APIs.",
+   "files": [
+     "mod.js",
+     "cli.ts"
+   ],
+   "license": "MIT",
+   "devDependencies": {
+     "@cloudflare/workers-types": "4.20251011.0",
+     "@types/bun": "1.3.0"
+   }
+ }