extract-from-sitemap 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/cli.js +56 -15
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -278,6 +278,9 @@ async function loadConfig() {
278
278
  if (!Array.isArray(config.sources))
279
279
  throw new Error("sources must be an array");
280
280
 
281
+ // Resolve top-level outDir to absolute path
282
+ const topLevelOutDir = path.resolve(config.outDir);
283
+
281
284
  // Validate source objects
282
285
  for (const [index, sourceConfig] of config.sources.entries()) {
283
286
  if (typeof sourceConfig !== "object" || sourceConfig === null) {
@@ -292,6 +295,22 @@ async function loadConfig() {
292
295
  sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
293
296
  sourceConfig.customUrls = sourceConfig.customUrls || [];
294
297
 
298
+ // Default outDir to top-level outDir if not specified
299
+ if (!sourceConfig.outDir) {
300
+ sourceConfig.outDir = config.outDir;
301
+ }
302
+
303
+ // Validate outDir is within top-level outDir (unless keepOriginalUrls is true)
304
+ if (!sourceConfig.keepOriginalUrls) {
305
+ const resolvedSourceOutDir = path.resolve(sourceConfig.outDir);
306
+
307
+ if (!resolvedSourceOutDir.startsWith(topLevelOutDir)) {
308
+ throw new Error(
309
+ `sources[${index}].outDir (${sourceConfig.outDir}) must be within the top-level outDir (${config.outDir})`
310
+ );
311
+ }
312
+ }
313
+
295
314
  // Either origin or customUrls must be provided
296
315
  if (
297
316
  !sourceConfig.origin &&
@@ -302,13 +321,6 @@ async function loadConfig() {
302
321
  );
303
322
  }
304
323
 
305
- // outDir is required unless keepOriginalUrls is true
306
- if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
307
- throw new Error(
308
- `sources[${index}].outDir is required when keepOriginalUrls is false`
309
- );
310
- }
311
-
312
324
  // Validate customUrls
313
325
  for (const [urlIndex, customUrl] of (
314
326
  sourceConfig.customUrls || []
@@ -514,9 +526,27 @@ async function processCustomUrls(customUrls, apiKey) {
514
526
  return files;
515
527
  }
516
528
 
529
+ /**
530
+ * Get path prefix for links in llms.txt
531
+ * @param {string} topLevelOutDir - Top-level output directory
532
+ * @param {string} sourceOutDir - Source-specific output directory
533
+ * @returns {string} Path prefix for links
534
+ */
535
+ function getPathPrefix(topLevelOutDir, sourceOutDir) {
536
+ const resolvedTopLevel = path.resolve(topLevelOutDir);
537
+ const resolvedSource = path.resolve(sourceOutDir);
538
+
539
+ if (resolvedSource === resolvedTopLevel) {
540
+ return "";
541
+ }
542
+
543
+ const relativePath = path.relative(resolvedTopLevel, resolvedSource);
544
+ return relativePath || "";
545
+ }
546
+
517
547
  /**
518
548
  * Generate combined llms.txt from all sources
519
- * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
549
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
520
550
  * @returns {string} Combined llms.txt content
521
551
  */
522
552
  function generateCombinedLlmsTxt(allSources) {
@@ -536,8 +566,13 @@ function generateCombinedLlmsTxt(allSources) {
536
566
  const title = file.title || path.replace(".md", "");
537
567
  const description = file.description ? `: ${file.description}` : "";
538
568
 
539
- // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
540
- const link = source.keepOriginalUrls ? file.originalUrl : path;
569
+ // Generate link based on keepOriginalUrls and pathPrefix
570
+ let link;
571
+ if (source.keepOriginalUrls) {
572
+ link = file.originalUrl;
573
+ } else {
574
+ link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
575
+ }
541
576
 
542
577
  combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
543
578
  }
@@ -598,13 +633,13 @@ async function main() {
598
633
  `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
599
634
  );
600
635
 
601
- // Only ensure source output directory exists if not keeping original URLs
602
- if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
636
+ // Ensure source output directory exists (if not keeping original URLs)
637
+ if (!sourceConfig.keepOriginalUrls) {
603
638
  fs.mkdirSync(sourceConfig.outDir, { recursive: true });
604
639
  }
605
640
 
606
- // Load previous manifest for this source (only if we have an outDir)
607
- const previousManifest = sourceConfig.outDir
641
+ // Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
642
+ const previousManifest = !sourceConfig.keepOriginalUrls
608
643
  ? loadManifest(sourceConfig.outDir)
609
644
  : { files: [], timestamp: new Date().toISOString() };
610
645
  const currentFiles = [];
@@ -652,7 +687,7 @@ async function main() {
652
687
  }
653
688
 
654
689
  // Write files to source directory (only if not keeping original URLs)
655
- if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
690
+ if (!sourceConfig.keepOriginalUrls) {
656
691
  for (const [filePath, file] of Object.entries(sourceFiles)) {
657
692
  let filename = filePath.startsWith("/")
658
693
  ? filePath.slice(1)
@@ -693,11 +728,17 @@ async function main() {
693
728
  );
694
729
  }
695
730
 
731
+ // Calculate path prefix for this source
732
+ const pathPrefix = sourceConfig.keepOriginalUrls
733
+ ? ""
734
+ : getPathPrefix(config.outDir, sourceConfig.outDir);
735
+
696
736
  // Add to all sources for combined llms.txt
697
737
  allSources.push({
698
738
  title: sourceConfig.title,
699
739
  files: sourceFiles,
700
740
  keepOriginalUrls: sourceConfig.keepOriginalUrls,
741
+ pathPrefix: pathPrefix,
701
742
  });
702
743
  } catch (error) {
703
744
  console.error(`❌ Error processing ${sourceName}:`, error.message);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.8",
4
+ "version": "0.0.10",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [