extract-from-sitemap 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +193 -91
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -10,16 +10,17 @@ const os = require("os");
|
|
|
10
10
|
const { extractFromSitemap } = require("./mod.js");
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
|
-
* @typedef {Object}
|
|
14
|
-
* @property {string} origin - The origin URL to process
|
|
15
|
-
* @property {boolean} forceExtract - Whether to force extraction for this
|
|
13
|
+
* @typedef {Object} SourceConfig
|
|
14
|
+
* @property {string} [origin] - The origin URL to process (optional)
|
|
15
|
+
* @property {boolean} forceExtract - Whether to force extraction for this source
|
|
16
|
+
* @property {string} outDir - Output directory for this source's extracted files
|
|
17
|
+
* @property {Array<{title: string, description: string, url: string}>} [customUrls] - Custom URLs to extract for this source
|
|
16
18
|
*/
|
|
17
19
|
|
|
18
20
|
/**
|
|
19
21
|
* @typedef {Object} Config
|
|
20
|
-
* @property {string} outDir -
|
|
21
|
-
* @property {
|
|
22
|
-
* @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
|
|
22
|
+
* @property {string} outDir - Top-level output directory for combined llms.txt
|
|
23
|
+
* @property {SourceConfig[]} sources - Array of source configurations
|
|
23
24
|
* @property {boolean} keepOriginalUrls - Whether to keep original URL structure
|
|
24
25
|
*/
|
|
25
26
|
|
|
@@ -217,11 +218,29 @@ async function loadConfig() {
|
|
|
217
218
|
{
|
|
218
219
|
$schema: "https://extract.llmtext.com/llmtext.schema.json",
|
|
219
220
|
outDir: "./docs",
|
|
220
|
-
|
|
221
|
-
{
|
|
222
|
-
|
|
221
|
+
sources: [
|
|
222
|
+
{
|
|
223
|
+
origin: "https://docs.parallel.ai",
|
|
224
|
+
forceExtract: false,
|
|
225
|
+
outDir: "./docs/parallel-docs",
|
|
226
|
+
},
|
|
227
|
+
{
|
|
228
|
+
origin: "https://parallel.ai",
|
|
229
|
+
forceExtract: true,
|
|
230
|
+
outDir: "./docs/parallel-main",
|
|
231
|
+
},
|
|
232
|
+
{
|
|
233
|
+
forceExtract: true,
|
|
234
|
+
outDir: "./docs/custom",
|
|
235
|
+
customUrls: [
|
|
236
|
+
{
|
|
237
|
+
title: "Custom Page",
|
|
238
|
+
description: "A custom page to extract",
|
|
239
|
+
url: "https://example.com/page",
|
|
240
|
+
},
|
|
241
|
+
],
|
|
242
|
+
},
|
|
223
243
|
],
|
|
224
|
-
customUrls: [],
|
|
225
244
|
keepOriginalUrls: false,
|
|
226
245
|
},
|
|
227
246
|
null,
|
|
@@ -236,26 +255,39 @@ async function loadConfig() {
|
|
|
236
255
|
|
|
237
256
|
// Validate required fields
|
|
238
257
|
if (!config.outDir) throw new Error("outDir is required");
|
|
239
|
-
if (!Array.isArray(config.
|
|
240
|
-
throw new Error("
|
|
258
|
+
if (!Array.isArray(config.sources))
|
|
259
|
+
throw new Error("sources must be an array");
|
|
241
260
|
|
|
242
|
-
// Validate
|
|
243
|
-
for (const [index,
|
|
244
|
-
if (typeof
|
|
245
|
-
throw new Error(`
|
|
261
|
+
// Validate source objects
|
|
262
|
+
for (const [index, sourceConfig] of config.sources.entries()) {
|
|
263
|
+
if (typeof sourceConfig !== "object" || sourceConfig === null) {
|
|
264
|
+
throw new Error(`sources[${index}] must be an object`);
|
|
246
265
|
}
|
|
247
|
-
if (!
|
|
248
|
-
throw new Error(`
|
|
266
|
+
if (!sourceConfig.outDir) {
|
|
267
|
+
throw new Error(`sources[${index}].outDir is required`);
|
|
249
268
|
}
|
|
250
|
-
if (typeof
|
|
251
|
-
throw new Error(`
|
|
269
|
+
if (typeof sourceConfig.forceExtract !== "boolean") {
|
|
270
|
+
throw new Error(`sources[${index}].forceExtract must be a boolean`);
|
|
271
|
+
}
|
|
272
|
+
// Either origin or customUrls must be provided
|
|
273
|
+
if (
|
|
274
|
+
!sourceConfig.origin &&
|
|
275
|
+
(!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
|
|
276
|
+
) {
|
|
277
|
+
throw new Error(
|
|
278
|
+
`sources[${index}] must have either origin or customUrls`
|
|
279
|
+
);
|
|
252
280
|
}
|
|
253
281
|
}
|
|
254
282
|
|
|
255
283
|
// Set defaults
|
|
256
|
-
config.customUrls = config.customUrls || [];
|
|
257
284
|
config.keepOriginalUrls = config.keepOriginalUrls ?? false;
|
|
258
285
|
|
|
286
|
+
// Set default customUrls for each source
|
|
287
|
+
for (const sourceConfig of config.sources) {
|
|
288
|
+
sourceConfig.customUrls = sourceConfig.customUrls || [];
|
|
289
|
+
}
|
|
290
|
+
|
|
259
291
|
return config;
|
|
260
292
|
} catch (error) {
|
|
261
293
|
console.error("❌ Error reading llmtext.json:", error.message);
|
|
@@ -444,6 +476,47 @@ async function processCustomUrls(customUrls, apiKey) {
|
|
|
444
476
|
return files;
|
|
445
477
|
}
|
|
446
478
|
|
|
479
|
+
/**
|
|
480
|
+
* Generate combined llms.txt from all sources
|
|
481
|
+
* @param {Array<{sourceName: string, files: Record<string, any>, origin?: string}>} allSources - All processed sources
|
|
482
|
+
* @returns {string} Combined llms.txt content
|
|
483
|
+
*/
|
|
484
|
+
function generateCombinedLlmsTxt(allSources) {
|
|
485
|
+
let combinedTxt =
|
|
486
|
+
"# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
|
|
487
|
+
|
|
488
|
+
for (const source of allSources) {
|
|
489
|
+
const sourceName = source.origin
|
|
490
|
+
? new URL(
|
|
491
|
+
source.origin.startsWith("http")
|
|
492
|
+
? source.origin
|
|
493
|
+
: `https://${source.origin}`
|
|
494
|
+
).hostname
|
|
495
|
+
: source.sourceName;
|
|
496
|
+
|
|
497
|
+
combinedTxt += `## ${sourceName}\n\n`;
|
|
498
|
+
|
|
499
|
+
// Sort files by path for consistent ordering
|
|
500
|
+
const sortedFiles = Object.entries(source.files)
|
|
501
|
+
.filter(([path]) => path !== "/llms.txt")
|
|
502
|
+
.sort(([a], [b]) => a.localeCompare(b));
|
|
503
|
+
|
|
504
|
+
for (const [path, file] of sortedFiles) {
|
|
505
|
+
if (file.content || file.title) {
|
|
506
|
+
const title = file.title || path.replace(".md", "");
|
|
507
|
+
const description = file.description ? `: ${file.description}` : "";
|
|
508
|
+
combinedTxt += `- [${title}](${path.replace(".md", "")}) (${
|
|
509
|
+
file.tokens
|
|
510
|
+
} tokens)${description}\n`;
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
combinedTxt += "\n";
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
return combinedTxt;
|
|
518
|
+
}
|
|
519
|
+
|
|
447
520
|
/**
|
|
448
521
|
* Clear stored API key credentials
|
|
449
522
|
*/
|
|
@@ -477,114 +550,143 @@ async function main() {
|
|
|
477
550
|
const config = await loadConfig();
|
|
478
551
|
const apiKey = await getApiKey();
|
|
479
552
|
|
|
480
|
-
// Ensure output directory exists
|
|
553
|
+
// Ensure top-level output directory exists
|
|
481
554
|
fs.mkdirSync(config.outDir, { recursive: true });
|
|
482
555
|
|
|
483
|
-
|
|
484
|
-
const previousManifest = loadManifest(config.outDir);
|
|
485
|
-
const currentFiles = [];
|
|
486
|
-
|
|
556
|
+
const allSources = [];
|
|
487
557
|
let totalTokens = 0;
|
|
488
558
|
let totalPages = 0;
|
|
489
559
|
let totalErrors = 0;
|
|
490
560
|
|
|
491
|
-
// Process each
|
|
492
|
-
for (const
|
|
561
|
+
// Process each source
|
|
562
|
+
for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
|
|
563
|
+
const sourceName = sourceConfig.origin
|
|
564
|
+
? `source ${sourceIndex + 1} (${sourceConfig.origin})`
|
|
565
|
+
: `source ${sourceIndex + 1} (custom URLs)`;
|
|
566
|
+
|
|
493
567
|
console.log(
|
|
494
|
-
`\nš Processing
|
|
568
|
+
`\nš Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract})`
|
|
495
569
|
);
|
|
496
570
|
|
|
571
|
+
// Ensure source output directory exists
|
|
572
|
+
fs.mkdirSync(sourceConfig.outDir, { recursive: true });
|
|
573
|
+
|
|
574
|
+
// Load previous manifest for this source
|
|
575
|
+
const previousManifest = loadManifest(sourceConfig.outDir);
|
|
576
|
+
const currentFiles = [];
|
|
577
|
+
let sourceFiles = {};
|
|
578
|
+
|
|
497
579
|
try {
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
580
|
+
// Process origin if provided
|
|
581
|
+
if (sourceConfig.origin) {
|
|
582
|
+
const result = await extractFromSitemap(
|
|
583
|
+
sourceConfig.origin,
|
|
584
|
+
sourceConfig.forceExtract,
|
|
585
|
+
apiKey
|
|
586
|
+
);
|
|
587
|
+
|
|
588
|
+
console.log(
|
|
589
|
+
`✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
|
|
590
|
+
);
|
|
591
|
+
if (result.errors > 0) {
|
|
592
|
+
console.log(`⚠️ ${result.errors} errors occurred`);
|
|
593
|
+
}
|
|
503
594
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
console.log(`⚠️ ${result.errors} errors occurred`);
|
|
595
|
+
sourceFiles = result.files;
|
|
596
|
+
totalTokens += result.totalTokens;
|
|
597
|
+
totalPages += result.totalPages;
|
|
598
|
+
totalErrors += result.errors;
|
|
509
599
|
}
|
|
510
600
|
|
|
511
|
-
//
|
|
512
|
-
|
|
601
|
+
// Process custom URLs for this source
|
|
602
|
+
if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
|
|
603
|
+
console.log(
|
|
604
|
+
`š Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
|
|
605
|
+
);
|
|
606
|
+
const customFiles = await processCustomUrls(
|
|
607
|
+
sourceConfig.customUrls,
|
|
608
|
+
apiKey
|
|
609
|
+
);
|
|
610
|
+
|
|
611
|
+
// Merge custom files with sitemap files
|
|
612
|
+
sourceFiles = { ...sourceFiles, ...customFiles };
|
|
613
|
+
|
|
614
|
+
for (const file of Object.values(customFiles)) {
|
|
615
|
+
totalTokens += file.tokens;
|
|
616
|
+
totalPages++;
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
// Write files to source directory
|
|
621
|
+
for (const [filePath, file] of Object.entries(sourceFiles)) {
|
|
513
622
|
let filename = filePath;
|
|
514
623
|
|
|
515
|
-
if (!config.keepOriginalUrls) {
|
|
516
|
-
//
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
: `https://${originConfig.origin}`
|
|
521
|
-
).hostname;
|
|
522
|
-
const domainDir = path.join(config.outDir, domain);
|
|
523
|
-
fs.mkdirSync(domainDir, { recursive: true });
|
|
524
|
-
filename = path.join(
|
|
525
|
-
domain,
|
|
526
|
-
filePath.startsWith("/") ? filePath.slice(1) : filePath
|
|
527
|
-
);
|
|
528
|
-
} else {
|
|
624
|
+
if (!config.keepOriginalUrls && sourceConfig.origin) {
|
|
625
|
+
// Use relative path within source directory
|
|
626
|
+
filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
|
|
627
|
+
} else if (!sourceConfig.origin) {
|
|
628
|
+
// For custom URL sources, use simple filename
|
|
529
629
|
filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
|
|
530
630
|
}
|
|
531
631
|
|
|
532
|
-
const fullFilePath = path.join(
|
|
632
|
+
const fullFilePath = path.join(sourceConfig.outDir, filename);
|
|
533
633
|
const fileDir = path.dirname(fullFilePath);
|
|
534
634
|
|
|
535
635
|
fs.mkdirSync(fileDir, { recursive: true });
|
|
536
636
|
fs.writeFileSync(fullFilePath, file.content);
|
|
537
637
|
currentFiles.push(filename);
|
|
538
638
|
|
|
539
|
-
console.log(
|
|
639
|
+
console.log(
|
|
640
|
+
`š Wrote: ${path.join(sourceConfig.outDir, filename)} (${
|
|
641
|
+
file.tokens
|
|
642
|
+
} tokens)`
|
|
643
|
+
);
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
// Clean up old files for this source
|
|
647
|
+
if (previousManifest.files.length > 0) {
|
|
648
|
+
cleanupOldFiles(
|
|
649
|
+
sourceConfig.outDir,
|
|
650
|
+
currentFiles,
|
|
651
|
+
previousManifest.files
|
|
652
|
+
);
|
|
540
653
|
}
|
|
541
654
|
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
655
|
+
// Save manifest for this source
|
|
656
|
+
const newManifest = {
|
|
657
|
+
files: currentFiles,
|
|
658
|
+
timestamp: new Date().toISOString(),
|
|
659
|
+
};
|
|
660
|
+
saveManifest(sourceConfig.outDir, newManifest);
|
|
661
|
+
|
|
662
|
+
// Add to all sources for combined llms.txt
|
|
663
|
+
allSources.push({
|
|
664
|
+
sourceName: `Source ${sourceIndex + 1}`,
|
|
665
|
+
origin: sourceConfig.origin,
|
|
666
|
+
files: sourceFiles,
|
|
667
|
+
});
|
|
545
668
|
} catch (error) {
|
|
546
|
-
console.error(
|
|
547
|
-
`ā Error processing ${originConfig.origin}:`,
|
|
548
|
-
error.message
|
|
549
|
-
);
|
|
669
|
+
console.error(`❌ Error processing ${sourceName}:`, error.message);
|
|
550
670
|
totalErrors++;
|
|
551
671
|
}
|
|
552
672
|
}
|
|
553
673
|
|
|
554
|
-
//
|
|
555
|
-
if (
|
|
556
|
-
|
|
557
|
-
const
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
const filePath = path.join(config.outDir, filename);
|
|
561
|
-
fs.writeFileSync(filePath, file.content);
|
|
562
|
-
currentFiles.push(filename);
|
|
563
|
-
totalTokens += file.tokens;
|
|
564
|
-
totalPages++;
|
|
565
|
-
|
|
566
|
-
console.log(`š Wrote: ${filename} (${file.tokens} tokens)`);
|
|
567
|
-
}
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
// Clean up old files
|
|
571
|
-
if (previousManifest.files.length > 0) {
|
|
572
|
-
cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
|
|
674
|
+
// Generate and write combined llms.txt to top-level outDir
|
|
675
|
+
if (allSources.length > 0) {
|
|
676
|
+
const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
|
|
677
|
+
const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
|
|
678
|
+
fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
|
|
679
|
+
console.log(`\nš Generated combined llms.txt: ${combinedLlmsTxtPath}`);
|
|
573
680
|
}
|
|
574
681
|
|
|
575
|
-
// Save new manifest
|
|
576
|
-
const newManifest = {
|
|
577
|
-
files: currentFiles,
|
|
578
|
-
timestamp: new Date().toISOString(),
|
|
579
|
-
};
|
|
580
|
-
saveManifest(config.outDir, newManifest);
|
|
581
|
-
|
|
582
682
|
console.log("\n✨ Extraction completed!");
|
|
583
683
|
console.log(`š Total: ${totalPages} pages, ${totalTokens} tokens`);
|
|
584
684
|
if (totalErrors > 0) {
|
|
585
685
|
console.log(`⚠️ Errors: ${totalErrors}`);
|
|
586
686
|
}
|
|
587
|
-
console.log(
|
|
687
|
+
console.log(
|
|
688
|
+
`š Top-level output directory: ${path.resolve(config.outDir)}`
|
|
689
|
+
);
|
|
588
690
|
console.log("\n💡 Use --clear-credentials to remove stored API key");
|
|
589
691
|
} catch (error) {
|
|
590
692
|
console.error("💥 Fatal error:", error.message);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.5",
|
|
4
|
+
"version": "0.0.6",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|