extract-from-sitemap 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +56 -17
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -278,6 +278,9 @@ async function loadConfig() {
|
|
|
278
278
|
if (!Array.isArray(config.sources))
|
|
279
279
|
throw new Error("sources must be an array");
|
|
280
280
|
|
|
281
|
+
// Resolve top-level outDir to absolute path
|
|
282
|
+
const topLevelOutDir = path.resolve(config.outDir);
|
|
283
|
+
|
|
281
284
|
// Validate source objects
|
|
282
285
|
for (const [index, sourceConfig] of config.sources.entries()) {
|
|
283
286
|
if (typeof sourceConfig !== "object" || sourceConfig === null) {
|
|
@@ -292,6 +295,22 @@ async function loadConfig() {
|
|
|
292
295
|
sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
|
|
293
296
|
sourceConfig.customUrls = sourceConfig.customUrls || [];
|
|
294
297
|
|
|
298
|
+
// Default outDir to top-level outDir if not specified
|
|
299
|
+
if (!sourceConfig.outDir) {
|
|
300
|
+
sourceConfig.outDir = config.outDir;
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
// Validate outDir is within top-level outDir (unless keepOriginalUrls is true)
|
|
304
|
+
if (!sourceConfig.keepOriginalUrls) {
|
|
305
|
+
const resolvedSourceOutDir = path.resolve(sourceConfig.outDir);
|
|
306
|
+
|
|
307
|
+
if (!resolvedSourceOutDir.startsWith(topLevelOutDir)) {
|
|
308
|
+
throw new Error(
|
|
309
|
+
`sources[${index}].outDir (${sourceConfig.outDir}) must be within the top-level outDir (${config.outDir})`
|
|
310
|
+
);
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
|
|
295
314
|
// Either origin or customUrls must be provided
|
|
296
315
|
if (
|
|
297
316
|
!sourceConfig.origin &&
|
|
@@ -302,13 +321,6 @@ async function loadConfig() {
|
|
|
302
321
|
);
|
|
303
322
|
}
|
|
304
323
|
|
|
305
|
-
// outDir is required unless keepOriginalUrls is true
|
|
306
|
-
if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
|
|
307
|
-
throw new Error(
|
|
308
|
-
`sources[${index}].outDir is required when keepOriginalUrls is false`
|
|
309
|
-
);
|
|
310
|
-
}
|
|
311
|
-
|
|
312
324
|
// Validate customUrls
|
|
313
325
|
for (const [urlIndex, customUrl] of (
|
|
314
326
|
sourceConfig.customUrls || []
|
|
@@ -514,9 +526,27 @@ async function processCustomUrls(customUrls, apiKey) {
|
|
|
514
526
|
return files;
|
|
515
527
|
}
|
|
516
528
|
|
|
529
|
+
/**
|
|
530
|
+
* Get path prefix for links in llms.txt
|
|
531
|
+
* @param {string} topLevelOutDir - Top-level output directory
|
|
532
|
+
* @param {string} sourceOutDir - Source-specific output directory
|
|
533
|
+
* @returns {string} Path prefix for links
|
|
534
|
+
*/
|
|
535
|
+
function getPathPrefix(topLevelOutDir, sourceOutDir) {
|
|
536
|
+
const resolvedTopLevel = path.resolve(topLevelOutDir);
|
|
537
|
+
const resolvedSource = path.resolve(sourceOutDir);
|
|
538
|
+
|
|
539
|
+
if (resolvedSource === resolvedTopLevel) {
|
|
540
|
+
return "";
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
const relativePath = path.relative(resolvedTopLevel, resolvedSource);
|
|
544
|
+
return relativePath ? relativePath + "/" : "";
|
|
545
|
+
}
|
|
546
|
+
|
|
517
547
|
/**
|
|
518
548
|
* Generate combined llms.txt from all sources
|
|
519
|
-
* @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
|
|
549
|
+
* @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
|
|
520
550
|
* @returns {string} Combined llms.txt content
|
|
521
551
|
*/
|
|
522
552
|
function generateCombinedLlmsTxt(allSources) {
|
|
@@ -536,10 +566,13 @@ function generateCombinedLlmsTxt(allSources) {
|
|
|
536
566
|
const title = file.title || path.replace(".md", "");
|
|
537
567
|
const description = file.description ? `: ${file.description}` : "";
|
|
538
568
|
|
|
539
|
-
//
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
569
|
+
// Generate link based on keepOriginalUrls and pathPrefix
|
|
570
|
+
let link;
|
|
571
|
+
if (source.keepOriginalUrls) {
|
|
572
|
+
link = file.originalUrl;
|
|
573
|
+
} else {
|
|
574
|
+
link = source.pathPrefix + path;
|
|
575
|
+
}
|
|
543
576
|
|
|
544
577
|
combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
|
|
545
578
|
}
|
|
@@ -600,13 +633,13 @@ async function main() {
|
|
|
600
633
|
`\nš Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
|
|
601
634
|
);
|
|
602
635
|
|
|
603
|
-
//
|
|
604
|
-
if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
|
|
636
|
+
// Ensure source output directory exists (if not keeping original URLs)
|
|
637
|
+
if (!sourceConfig.keepOriginalUrls) {
|
|
605
638
|
fs.mkdirSync(sourceConfig.outDir, { recursive: true });
|
|
606
639
|
}
|
|
607
640
|
|
|
608
|
-
// Load previous manifest for this source (only if we have an outDir)
|
|
609
|
-
const previousManifest = sourceConfig.outDir
|
|
641
|
+
// Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
|
|
642
|
+
const previousManifest = !sourceConfig.keepOriginalUrls
|
|
610
643
|
? loadManifest(sourceConfig.outDir)
|
|
611
644
|
: { files: [], timestamp: new Date().toISOString() };
|
|
612
645
|
const currentFiles = [];
|
|
@@ -654,7 +687,7 @@ async function main() {
|
|
|
654
687
|
}
|
|
655
688
|
|
|
656
689
|
// Write files to source directory (only if not keeping original URLs)
|
|
657
|
-
if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
|
|
690
|
+
if (!sourceConfig.keepOriginalUrls) {
|
|
658
691
|
for (const [filePath, file] of Object.entries(sourceFiles)) {
|
|
659
692
|
let filename = filePath.startsWith("/")
|
|
660
693
|
? filePath.slice(1)
|
|
@@ -695,11 +728,17 @@ async function main() {
|
|
|
695
728
|
);
|
|
696
729
|
}
|
|
697
730
|
|
|
731
|
+
// Calculate path prefix for this source
|
|
732
|
+
const pathPrefix = sourceConfig.keepOriginalUrls
|
|
733
|
+
? ""
|
|
734
|
+
: getPathPrefix(config.outDir, sourceConfig.outDir);
|
|
735
|
+
|
|
698
736
|
// Add to all sources for combined llms.txt
|
|
699
737
|
allSources.push({
|
|
700
738
|
title: sourceConfig.title,
|
|
701
739
|
files: sourceFiles,
|
|
702
740
|
keepOriginalUrls: sourceConfig.keepOriginalUrls,
|
|
741
|
+
pathPrefix: pathPrefix,
|
|
703
742
|
});
|
|
704
743
|
} catch (error) {
|
|
705
744
|
console.error(`ā Error processing ${sourceName}:`, error.message);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.7",
|
|
4
|
+
"version": "0.0.9",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|