extract-from-sitemap 0.0.8 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +56 -15
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -278,6 +278,9 @@ async function loadConfig() {
|
|
|
278
278
|
if (!Array.isArray(config.sources))
|
|
279
279
|
throw new Error("sources must be an array");
|
|
280
280
|
|
|
281
|
+
// Resolve top-level outDir to absolute path
|
|
282
|
+
const topLevelOutDir = path.resolve(config.outDir);
|
|
283
|
+
|
|
281
284
|
// Validate source objects
|
|
282
285
|
for (const [index, sourceConfig] of config.sources.entries()) {
|
|
283
286
|
if (typeof sourceConfig !== "object" || sourceConfig === null) {
|
|
@@ -292,6 +295,22 @@ async function loadConfig() {
|
|
|
292
295
|
sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
|
|
293
296
|
sourceConfig.customUrls = sourceConfig.customUrls || [];
|
|
294
297
|
|
|
298
|
+
// Default outDir to top-level outDir if not specified
|
|
299
|
+
if (!sourceConfig.outDir) {
|
|
300
|
+
sourceConfig.outDir = config.outDir;
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
// Validate outDir is within top-level outDir (unless keepOriginalUrls is true)
|
|
304
|
+
if (!sourceConfig.keepOriginalUrls) {
|
|
305
|
+
const resolvedSourceOutDir = path.resolve(sourceConfig.outDir);
|
|
306
|
+
|
|
307
|
+
if (!resolvedSourceOutDir.startsWith(topLevelOutDir)) {
|
|
308
|
+
throw new Error(
|
|
309
|
+
`sources[${index}].outDir (${sourceConfig.outDir}) must be within the top-level outDir (${config.outDir})`
|
|
310
|
+
);
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
|
|
295
314
|
// Either origin or customUrls must be provided
|
|
296
315
|
if (
|
|
297
316
|
!sourceConfig.origin &&
|
|
@@ -302,13 +321,6 @@ async function loadConfig() {
|
|
|
302
321
|
);
|
|
303
322
|
}
|
|
304
323
|
|
|
305
|
-
// outDir is required unless keepOriginalUrls is true
|
|
306
|
-
if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
|
|
307
|
-
throw new Error(
|
|
308
|
-
`sources[${index}].outDir is required when keepOriginalUrls is false`
|
|
309
|
-
);
|
|
310
|
-
}
|
|
311
|
-
|
|
312
324
|
// Validate customUrls
|
|
313
325
|
for (const [urlIndex, customUrl] of (
|
|
314
326
|
sourceConfig.customUrls || []
|
|
@@ -514,9 +526,27 @@ async function processCustomUrls(customUrls, apiKey) {
|
|
|
514
526
|
return files;
|
|
515
527
|
}
|
|
516
528
|
|
|
529
|
+
/**
|
|
530
|
+
* Get path prefix for links in llms.txt
|
|
531
|
+
* @param {string} topLevelOutDir - Top-level output directory
|
|
532
|
+
* @param {string} sourceOutDir - Source-specific output directory
|
|
533
|
+
* @returns {string} Path prefix for links
|
|
534
|
+
*/
|
|
535
|
+
function getPathPrefix(topLevelOutDir, sourceOutDir) {
|
|
536
|
+
const resolvedTopLevel = path.resolve(topLevelOutDir);
|
|
537
|
+
const resolvedSource = path.resolve(sourceOutDir);
|
|
538
|
+
|
|
539
|
+
if (resolvedSource === resolvedTopLevel) {
|
|
540
|
+
return "";
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
const relativePath = path.relative(resolvedTopLevel, resolvedSource);
|
|
544
|
+
return relativePath || "";
|
|
545
|
+
}
|
|
546
|
+
|
|
517
547
|
/**
|
|
518
548
|
* Generate combined llms.txt from all sources
|
|
519
|
-
* @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
|
|
549
|
+
* @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
|
|
520
550
|
* @returns {string} Combined llms.txt content
|
|
521
551
|
*/
|
|
522
552
|
function generateCombinedLlmsTxt(allSources) {
|
|
@@ -536,8 +566,13 @@ function generateCombinedLlmsTxt(allSources) {
|
|
|
536
566
|
const title = file.title || path.replace(".md", "");
|
|
537
567
|
const description = file.description ? `: ${file.description}` : "";
|
|
538
568
|
|
|
539
|
-
//
|
|
540
|
-
|
|
569
|
+
// Generate link based on keepOriginalUrls and pathPrefix
|
|
570
|
+
let link;
|
|
571
|
+
if (source.keepOriginalUrls) {
|
|
572
|
+
link = file.originalUrl;
|
|
573
|
+
} else {
|
|
574
|
+
link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
|
|
575
|
+
}
|
|
541
576
|
|
|
542
577
|
combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
|
|
543
578
|
}
|
|
@@ -598,13 +633,13 @@ async function main() {
|
|
|
598
633
|
`\nš Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
|
|
599
634
|
);
|
|
600
635
|
|
|
601
|
-
//
|
|
602
|
-
if (!sourceConfig.keepOriginalUrls
|
|
636
|
+
// Ensure source output directory exists (if not keeping original URLs)
|
|
637
|
+
if (!sourceConfig.keepOriginalUrls) {
|
|
603
638
|
fs.mkdirSync(sourceConfig.outDir, { recursive: true });
|
|
604
639
|
}
|
|
605
640
|
|
|
606
|
-
// Load previous manifest for this source (only if we have an outDir)
|
|
607
|
-
const previousManifest = sourceConfig.
|
|
641
|
+
// Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
|
|
642
|
+
const previousManifest = !sourceConfig.keepOriginalUrls
|
|
608
643
|
? loadManifest(sourceConfig.outDir)
|
|
609
644
|
: { files: [], timestamp: new Date().toISOString() };
|
|
610
645
|
const currentFiles = [];
|
|
@@ -652,7 +687,7 @@ async function main() {
|
|
|
652
687
|
}
|
|
653
688
|
|
|
654
689
|
// Write files to source directory (only if not keeping original URLs)
|
|
655
|
-
if (!sourceConfig.keepOriginalUrls
|
|
690
|
+
if (!sourceConfig.keepOriginalUrls) {
|
|
656
691
|
for (const [filePath, file] of Object.entries(sourceFiles)) {
|
|
657
692
|
let filename = filePath.startsWith("/")
|
|
658
693
|
? filePath.slice(1)
|
|
@@ -693,11 +728,17 @@ async function main() {
|
|
|
693
728
|
);
|
|
694
729
|
}
|
|
695
730
|
|
|
731
|
+
// Calculate path prefix for this source
|
|
732
|
+
const pathPrefix = sourceConfig.keepOriginalUrls
|
|
733
|
+
? ""
|
|
734
|
+
: getPathPrefix(config.outDir, sourceConfig.outDir);
|
|
735
|
+
|
|
696
736
|
// Add to all sources for combined llms.txt
|
|
697
737
|
allSources.push({
|
|
698
738
|
title: sourceConfig.title,
|
|
699
739
|
files: sourceFiles,
|
|
700
740
|
keepOriginalUrls: sourceConfig.keepOriginalUrls,
|
|
741
|
+
pathPrefix: pathPrefix,
|
|
701
742
|
});
|
|
702
743
|
} catch (error) {
|
|
703
744
|
console.error(`❌ Error processing ${sourceName}:`, error.message);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.
|
|
4
|
+
"version": "0.0.10",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|