extract-from-sitemap 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/cli.js +56 -17
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -278,6 +278,9 @@ async function loadConfig() {
278
278
  if (!Array.isArray(config.sources))
279
279
  throw new Error("sources must be an array");
280
280
 
281
+ // Resolve top-level outDir to absolute path
282
+ const topLevelOutDir = path.resolve(config.outDir);
283
+
281
284
  // Validate source objects
282
285
  for (const [index, sourceConfig] of config.sources.entries()) {
283
286
  if (typeof sourceConfig !== "object" || sourceConfig === null) {
@@ -292,6 +295,22 @@ async function loadConfig() {
292
295
  sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
293
296
  sourceConfig.customUrls = sourceConfig.customUrls || [];
294
297
 
298
+ // Default outDir to top-level outDir if not specified
299
+ if (!sourceConfig.outDir) {
300
+ sourceConfig.outDir = config.outDir;
301
+ }
302
+
303
+ // Validate outDir is within top-level outDir (unless keepOriginalUrls is true)
304
+ if (!sourceConfig.keepOriginalUrls) {
305
+ const resolvedSourceOutDir = path.resolve(sourceConfig.outDir);
306
+
307
+ if (!resolvedSourceOutDir.startsWith(topLevelOutDir)) {
308
+ throw new Error(
309
+ `sources[${index}].outDir (${sourceConfig.outDir}) must be within the top-level outDir (${config.outDir})`
310
+ );
311
+ }
312
+ }
313
+
295
314
  // Either origin or customUrls must be provided
296
315
  if (
297
316
  !sourceConfig.origin &&
@@ -302,13 +321,6 @@ async function loadConfig() {
302
321
  );
303
322
  }
304
323
 
305
- // outDir is required unless keepOriginalUrls is true
306
- if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
307
- throw new Error(
308
- `sources[${index}].outDir is required when keepOriginalUrls is false`
309
- );
310
- }
311
-
312
324
  // Validate customUrls
313
325
  for (const [urlIndex, customUrl] of (
314
326
  sourceConfig.customUrls || []
@@ -514,9 +526,27 @@ async function processCustomUrls(customUrls, apiKey) {
514
526
  return files;
515
527
  }
516
528
 
529
+ /**
530
+ * Get path prefix for links in llms.txt
531
+ * @param {string} topLevelOutDir - Top-level output directory
532
+ * @param {string} sourceOutDir - Source-specific output directory
533
+ * @returns {string} Path prefix for links
534
+ */
535
+ function getPathPrefix(topLevelOutDir, sourceOutDir) {
536
+ const resolvedTopLevel = path.resolve(topLevelOutDir);
537
+ const resolvedSource = path.resolve(sourceOutDir);
538
+
539
+ if (resolvedSource === resolvedTopLevel) {
540
+ return "";
541
+ }
542
+
543
+ const relativePath = path.relative(resolvedTopLevel, resolvedSource);
544
+ return relativePath ? relativePath + "/" : "";
545
+ }
546
+
517
547
  /**
518
548
  * Generate combined llms.txt from all sources
519
- * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
549
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
520
550
  * @returns {string} Combined llms.txt content
521
551
  */
522
552
  function generateCombinedLlmsTxt(allSources) {
@@ -536,10 +566,13 @@ function generateCombinedLlmsTxt(allSources) {
536
566
  const title = file.title || path.replace(".md", "");
537
567
  const description = file.description ? `: ${file.description}` : "";
538
568
 
539
- // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
540
- const link = source.keepOriginalUrls
541
- ? file.originalUrl
542
- : path.replace(".md", "");
569
+ // Generate link based on keepOriginalUrls and pathPrefix
570
+ let link;
571
+ if (source.keepOriginalUrls) {
572
+ link = file.originalUrl;
573
+ } else {
574
+ link = source.pathPrefix + path;
575
+ }
543
576
 
544
577
  combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
545
578
  }
@@ -600,13 +633,13 @@ async function main() {
600
633
  `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
601
634
  );
602
635
 
603
- // Only ensure source output directory exists if not keeping original URLs
604
- if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
636
+ // Ensure source output directory exists (if not keeping original URLs)
637
+ if (!sourceConfig.keepOriginalUrls) {
605
638
  fs.mkdirSync(sourceConfig.outDir, { recursive: true });
606
639
  }
607
640
 
608
- // Load previous manifest for this source (only if we have an outDir)
609
- const previousManifest = sourceConfig.outDir
641
+ // Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
642
+ const previousManifest = !sourceConfig.keepOriginalUrls
610
643
  ? loadManifest(sourceConfig.outDir)
611
644
  : { files: [], timestamp: new Date().toISOString() };
612
645
  const currentFiles = [];
@@ -654,7 +687,7 @@ async function main() {
654
687
  }
655
688
 
656
689
  // Write files to source directory (only if not keeping original URLs)
657
- if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
690
+ if (!sourceConfig.keepOriginalUrls) {
658
691
  for (const [filePath, file] of Object.entries(sourceFiles)) {
659
692
  let filename = filePath.startsWith("/")
660
693
  ? filePath.slice(1)
@@ -695,11 +728,17 @@ async function main() {
695
728
  );
696
729
  }
697
730
 
731
+ // Calculate path prefix for this source
732
+ const pathPrefix = sourceConfig.keepOriginalUrls
733
+ ? ""
734
+ : getPathPrefix(config.outDir, sourceConfig.outDir);
735
+
698
736
  // Add to all sources for combined llms.txt
699
737
  allSources.push({
700
738
  title: sourceConfig.title,
701
739
  files: sourceFiles,
702
740
  keepOriginalUrls: sourceConfig.keepOriginalUrls,
741
+ pathPrefix: pathPrefix,
703
742
  });
704
743
  } catch (error) {
705
744
  console.error(`āŒ Error processing ${sourceName}:`, error.message);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.7",
4
+ "version": "0.0.9",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [