extract-from-sitemap 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +207 -82
- package/package.json +1 -1
package/cli.js
CHANGED
@@ -8,13 +8,20 @@ const http = require("http");
 const { URL, URLSearchParams } = require("url");
 const os = require("os");
 const { extractFromSitemap } = require("./mod.js");
+
+/**
+ * @typedef {Object} SourceConfig
+ * @property {string} [origin] - The origin URL to process (optional)
+ * @property {boolean} forceExtract - Whether to force extraction for this source
+ * @property {string} outDir - Output directory for this source's extracted files
+ * @property {Array<{title: string, description: string, url: string}>} [customUrls] - Custom URLs to extract for this source
+ */
+
 /**
  * @typedef {Object} Config
- * @property {string} outDir -
- * @property {
- * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
+ * @property {string} outDir - Top-level output directory for combined llms.txt
+ * @property {SourceConfig[]} sources - Array of source configurations
  * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
- * @property {boolean} forceExtract - Whether to force extraction even if files exist
  */

 /**
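
For reference, a config object in the new shape described by these typedefs looks roughly like the sketch below. The origins, directories, and custom URL are illustrative placeholders, not values from the package; the structural rules (a per-source outDir and forceExtract, plus origin and/or customUrls) follow the typedefs and the validation added later in this diff.

const exampleConfig = {
  // The top-level outDir now only holds the combined llms.txt; each source writes into its own outDir.
  outDir: "./docs",
  keepOriginalUrls: false,
  sources: [
    // Sitemap-backed source: pages are extracted from the origin's sitemap.
    {
      origin: "https://docs.example.com",
      forceExtract: false,
      outDir: "./docs/example-docs",
    },
    // Custom-URLs-only source: no origin, so customUrls must be non-empty.
    {
      forceExtract: true,
      outDir: "./docs/custom",
      customUrls: [
        {
          title: "Custom Page",
          description: "A custom page to extract",
          url: "https://example.com/page",
        },
      ],
    },
  ],
};
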
@@ -209,11 +216,32 @@ async function loadConfig() {
     console.log(
       JSON.stringify(
         {
+          $schema: "https://extract.llmtext.com/llmtext.schema.json",
           outDir: "./docs",
-
-
+          sources: [
+            {
+              origin: "https://docs.parallel.ai",
+              forceExtract: false,
+              outDir: "./docs/parallel-docs",
+            },
+            {
+              origin: "https://parallel.ai",
+              forceExtract: true,
+              outDir: "./docs/parallel-main",
+            },
+            {
+              forceExtract: true,
+              outDir: "./docs/custom",
+              customUrls: [
+                {
+                  title: "Custom Page",
+                  description: "A custom page to extract",
+                  url: "https://example.com/page",
+                },
+              ],
+            },
+          ],
           keepOriginalUrls: false,
-          forceExtract: false,
         },
         null,
         2
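
The generated default now carries a $schema pointer, so a config saved to disk can be validated by editors that understand JSON Schema. Below is a minimal sketch of persisting that default; the filename "llmtext.json" is an assumption, since this diff does not show which path loadConfig actually reads.

const fs = require("fs");

// Assumption: the config file name; loadConfig's lookup path is not visible in this diff.
const configPath = "llmtext.json";

fs.writeFileSync(
  configPath,
  JSON.stringify(
    {
      $schema: "https://extract.llmtext.com/llmtext.schema.json",
      outDir: "./docs",
      keepOriginalUrls: false,
      sources: [
        { origin: "https://docs.parallel.ai", forceExtract: false, outDir: "./docs/parallel-docs" },
      ],
    },
    null,
    2
  )
);
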
@@ -227,13 +255,38 @@ async function loadConfig() {

     // Validate required fields
     if (!config.outDir) throw new Error("outDir is required");
-    if (!Array.isArray(config.
-      throw new Error("
+    if (!Array.isArray(config.sources))
+      throw new Error("sources must be an array");
+
+    // Validate source objects
+    for (const [index, sourceConfig] of config.sources.entries()) {
+      if (typeof sourceConfig !== "object" || sourceConfig === null) {
+        throw new Error(`sources[${index}] must be an object`);
+      }
+      if (!sourceConfig.outDir) {
+        throw new Error(`sources[${index}].outDir is required`);
+      }
+      if (typeof sourceConfig.forceExtract !== "boolean") {
+        throw new Error(`sources[${index}].forceExtract must be a boolean`);
+      }
+      // Either origin or customUrls must be provided
+      if (
+        !sourceConfig.origin &&
+        (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
+      ) {
+        throw new Error(
+          `sources[${index}] must have either origin or customUrls`
+        );
+      }
+    }

     // Set defaults
-    config.customUrls = config.customUrls || [];
     config.keepOriginalUrls = config.keepOriginalUrls ?? false;
-
+
+    // Set default customUrls for each source
+    for (const sourceConfig of config.sources) {
+      sourceConfig.customUrls = sourceConfig.customUrls || [];
+    }

     return config;
   } catch (error) {
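
Restating the new validation rules with illustrative entries (values are made up; the messages in the comments are the errors thrown above, keyed to each entry's index in config.sources):

// Entries that pass validation: each has outDir, a boolean forceExtract,
// and either an origin or a non-empty customUrls array.
const validSources = [
  { origin: "https://docs.example.com", forceExtract: false, outDir: "./docs/a" },
  {
    forceExtract: true,
    outDir: "./docs/b",
    customUrls: [{ title: "Page", description: "Example", url: "https://example.com/page" }],
  },
];

// Entries that throw, with the corresponding message:
const invalidSources = [
  { origin: "https://docs.example.com", forceExtract: false },   // "sources[0].outDir is required"
  { origin: "https://docs.example.com", outDir: "./docs/c" },    // "sources[1].forceExtract must be a boolean"
  { forceExtract: true, outDir: "./docs/d" },                    // "sources[2] must have either origin or customUrls"
];
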
@@ -370,10 +423,9 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
  * Process custom URLs through extraction API
  * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
  * @param {string} apiKey - API key for authentication
- * @param {boolean} forceExtract - Whether to force extraction
  * @returns {Promise<Record<string, any>>} Extracted files
  */
-async function processCustomUrls(customUrls, apiKey, forceExtract) {
+async function processCustomUrls(customUrls, apiKey) {
   const files = {};

   for (const customUrl of customUrls) {
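
Since forceExtract is dropped from the signature, callers now pass only the URL list and the API key. A hedged call-site sketch; the URL entry is illustrative and apiKey stands for the value obtained from getApiKey() elsewhere in cli.js:

const files = await processCustomUrls(
  [{ title: "Custom Page", description: "A custom page to extract", url: "https://example.com/page" }],
  apiKey
);
// Per the code in main(), each value in `files` carries at least `content` and `tokens`.
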
@@ -424,6 +476,47 @@ async function processCustomUrls(customUrls, apiKey, forceExtract) {
   return files;
 }

+/**
+ * Generate combined llms.txt from all sources
+ * @param {Array<{sourceName: string, files: Record<string, any>, origin?: string}>} allSources - All processed sources
+ * @returns {string} Combined llms.txt content
+ */
+function generateCombinedLlmsTxt(allSources) {
+  let combinedTxt =
+    "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
+
+  for (const source of allSources) {
+    const sourceName = source.origin
+      ? new URL(
+          source.origin.startsWith("http")
+            ? source.origin
+            : `https://${source.origin}`
+        ).hostname
+      : source.sourceName;
+
+    combinedTxt += `## ${sourceName}\n\n`;
+
+    // Sort files by path for consistent ordering
+    const sortedFiles = Object.entries(source.files)
+      .filter(([path]) => path !== "/llms.txt")
+      .sort(([a], [b]) => a.localeCompare(b));
+
+    for (const [path, file] of sortedFiles) {
+      if (file.content || file.title) {
+        const title = file.title || path.replace(".md", "");
+        const description = file.description ? `: ${file.description}` : "";
+        combinedTxt += `- [${title}](${path.replace(".md", "")}) (${
+          file.tokens
+        } tokens)${description}\n`;
+      }
+    }
+
+    combinedTxt += "\n";
+  }
+
+  return combinedTxt;
+}
+
 /**
  * Clear stored API key credentials
  */
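
An illustrative input/output pair for generateCombinedLlmsTxt; the paths, titles, and token counts are invented, only the output format follows the function above:

const sample = generateCombinedLlmsTxt([
  {
    sourceName: "Source 1",
    origin: "https://docs.example.com",
    files: {
      "/quickstart.md": { title: "Quickstart", description: "Getting started", content: "...", tokens: 812 },
      "/llms.txt": { content: "...", tokens: 100 }, // excluded by the /llms.txt filter
    },
  },
]);

// `sample` is roughly:
// # Documentation Collection
//
// > Combined documentation from multiple sources
//
// ## docs.example.com
//
// - [Quickstart](/quickstart) (812 tokens): Getting started
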
@@ -457,111 +550,143 @@ async function main() {
     const config = await loadConfig();
     const apiKey = await getApiKey();

-    // Ensure output directory exists
+    // Ensure top-level output directory exists
     fs.mkdirSync(config.outDir, { recursive: true });

-
-    const previousManifest = loadManifest(config.outDir);
-    const currentFiles = [];
-
+    const allSources = [];
     let totalTokens = 0;
     let totalPages = 0;
     let totalErrors = 0;

-    // Process each
-    for (const
-
+    // Process each source
+    for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
+      const sourceName = sourceConfig.origin
+        ? `source ${sourceIndex + 1} (${sourceConfig.origin})`
+        : `source ${sourceIndex + 1} (custom URLs)`;
+
+      console.log(
+        `\nProcessing ${sourceName} (forceExtract: ${sourceConfig.forceExtract})`
+      );
+
+      // Ensure source output directory exists
+      fs.mkdirSync(sourceConfig.outDir, { recursive: true });
+
+      // Load previous manifest for this source
+      const previousManifest = loadManifest(sourceConfig.outDir);
+      const currentFiles = [];
+      let sourceFiles = {};

       try {
-
-
-
-
-
+        // Process origin if provided
+        if (sourceConfig.origin) {
+          const result = await extractFromSitemap(
+            sourceConfig.origin,
+            sourceConfig.forceExtract,
+            apiKey
+          );
+
+          console.log(
+            `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
+          );
+          if (result.errors > 0) {
+            console.log(`⚠️ ${result.errors} errors occurred`);
+          }

-
-
-
-
-          console.log(`⚠️ ${result.errors} errors occurred`);
+          sourceFiles = result.files;
+          totalTokens += result.totalTokens;
+          totalPages += result.totalPages;
+          totalErrors += result.errors;
         }

-      //
-
+        // Process custom URLs for this source
+        if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
+          console.log(
+            `Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
+          );
+          const customFiles = await processCustomUrls(
+            sourceConfig.customUrls,
+            apiKey
+          );
+
+          // Merge custom files with sitemap files
+          sourceFiles = { ...sourceFiles, ...customFiles };
+
+          for (const file of Object.values(customFiles)) {
+            totalTokens += file.tokens;
+            totalPages++;
+          }
+        }
+
+        // Write files to source directory
+        for (const [filePath, file] of Object.entries(sourceFiles)) {
          let filename = filePath;

-        if (!config.keepOriginalUrls) {
-          //
-
-
-
-          const domainDir = path.join(config.outDir, domain);
-          fs.mkdirSync(domainDir, { recursive: true });
-          filename = path.join(
-            domain,
-            filePath.startsWith("/") ? filePath.slice(1) : filePath
-          );
-        } else {
+          if (!config.keepOriginalUrls && sourceConfig.origin) {
+            // Use relative path within source directory
+            filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
+          } else if (!sourceConfig.origin) {
+            // For custom URL sources, use simple filename
            filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
          }

-        const fullFilePath = path.join(
+          const fullFilePath = path.join(sourceConfig.outDir, filename);
          const fileDir = path.dirname(fullFilePath);

          fs.mkdirSync(fileDir, { recursive: true });
          fs.writeFileSync(fullFilePath, file.content);
          currentFiles.push(filename);

-        console.log(
+          console.log(
+            `Wrote: ${path.join(sourceConfig.outDir, filename)} (${
+              file.tokens
+            } tokens)`
+          );
        }

-
-
-
+        // Clean up old files for this source
+        if (previousManifest.files.length > 0) {
+          cleanupOldFiles(
+            sourceConfig.outDir,
+            currentFiles,
+            previousManifest.files
+          );
+        }
+
+        // Save manifest for this source
+        const newManifest = {
+          files: currentFiles,
+          timestamp: new Date().toISOString(),
+        };
+        saveManifest(sourceConfig.outDir, newManifest);
+
+        // Add to all sources for combined llms.txt
+        allSources.push({
+          sourceName: `Source ${sourceIndex + 1}`,
+          origin: sourceConfig.origin,
+          files: sourceFiles,
+        });
      } catch (error) {
-      console.error(`❌ Error processing ${
+        console.error(`❌ Error processing ${sourceName}:`, error.message);
        totalErrors++;
      }
    }

-    //
-    if (
-
-    const
-
-
-      config.forceExtract
-    );
-
-    for (const [filename, file] of Object.entries(customFiles)) {
-      const filePath = path.join(config.outDir, filename);
-      fs.writeFileSync(filePath, file.content);
-      currentFiles.push(filename);
-      totalTokens += file.tokens;
-      totalPages++;
-
-      console.log(`Wrote: ${filename} (${file.tokens} tokens)`);
-    }
+    // Generate and write combined llms.txt to top-level outDir
+    if (allSources.length > 0) {
+      const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
+      const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
+      fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
+      console.log(`\nGenerated combined llms.txt: ${combinedLlmsTxtPath}`);
    }

-    // Clean up old files
-    if (previousManifest.files.length > 0) {
-      cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
-    }
-
-    // Save new manifest
-    const newManifest = {
-      files: currentFiles,
-      timestamp: new Date().toISOString(),
-    };
-    saveManifest(config.outDir, newManifest);
-
    console.log("\n✨ Extraction completed!");
    console.log(`Total: ${totalPages} pages, ${totalTokens} tokens`);
    if (totalErrors > 0) {
      console.log(`⚠️ Errors: ${totalErrors}`);
    }
-    console.log(
+    console.log(
+      `Top-level output directory: ${path.resolve(config.outDir)}`
+    );
    console.log("\nUse --clear-credentials to remove stored API key");
  } catch (error) {
    console.error("Fatal error:", error.message);
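
With the default config shown earlier, a run of the reworked main() lays files out roughly like this; the per-source markdown names depend on each sitemap, so they are omitted, and the manifest file name written by saveManifest is not visible in this diff:

docs/
  llms.txt           combined index written by generateCombinedLlmsTxt
  parallel-docs/     source 1 output (plus its own manifest)
  parallel-main/     source 2 output
  custom/            source 3 output (custom URLs only)
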
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.4",
+  "version": "0.0.6",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [