extract-from-sitemap 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +244 -106
- package/mod.js +6 -51
- package/package.json +1 -1
package/cli.js
CHANGED
@@ -10,17 +10,19 @@ const os = require("os");
 const { extractFromSitemap } = require("./mod.js");
 
 /**
- * @typedef {Object}
- * @property {string}
- * @property {
+ * @typedef {Object} SourceConfig
+ * @property {string} title - The title for this source
+ * @property {string} [origin] - The origin URL to process (optional)
+ * @property {string} [outDir] - Output directory for this source's extracted files
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
  */
 
 /**
  * @typedef {Object} Config
- * @property {string} outDir -
- * @property {
- * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
- * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
+ * @property {string} outDir - Top-level output directory for combined llms.txt
+ * @property {SourceConfig[]} sources - Array of source configurations
  */
 
 /**
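The practical effect of the typedef change: a config no longer lists customUrls and keepOriginalUrls at the top level, but instead declares a sources array under a top-level outDir. A minimal sketch matching the new typedefs (values are illustrative, not taken from the package; only the $schema URL comes from the diff):

```js
// Minimal config under the new Config / SourceConfig shape (illustrative values).
// The top-level outDir receives the combined llms.txt; each source writes its own
// files to its own outDir unless keepOriginalUrls is true.
const config = {
  $schema: "https://extract.llmtext.com/llmtext.schema.json",
  outDir: "./docs",
  sources: [
    {
      title: "Example Docs",              // required for every source
      origin: "https://docs.example.com", // optional when customUrls is given
      outDir: "./docs/example",           // required unless keepOriginalUrls is true
      forceExtract: false,
      keepOriginalUrls: false,
    },
  ],
};
```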
@@ -217,12 +219,49 @@ async function loadConfig() {
     {
       $schema: "https://extract.llmtext.com/llmtext.schema.json",
       outDir: "./docs",
-
-      {
-
+      sources: [
+        {
+          title: "Parallel AI Documentation",
+          origin: "https://docs.parallel.ai",
+          forceExtract: false,
+          outDir: "./docs/parallel-docs",
+          keepOriginalUrls: false,
+        },
+        {
+          title: "Parallel AI Website",
+          origin: "https://parallel.ai",
+          forceExtract: true,
+          outDir: "./docs/parallel-main",
+          keepOriginalUrls: false,
+        },
+        {
+          title: "Custom Resources",
+          forceExtract: true,
+          outDir: "./docs/custom",
+          keepOriginalUrls: false,
+          customUrls: [
+            {
+              title: "Custom Page",
+              description: "A custom page to extract",
+              filename: "custom-page",
+              url: "https://example.com/page",
+            },
+          ],
+        },
+        {
+          title: "External References",
+          keepOriginalUrls: true,
+          forceExtract: false,
+          customUrls: [
+            {
+              title: "External API Guide",
+              description: "Third-party API documentation",
+              filename: "external-api",
+              url: "https://external.com/api-guide",
+            },
+          ],
+        },
       ],
-      customUrls: [],
-      keepOriginalUrls: false,
     },
     null,
     2
@@ -236,25 +275,56 @@ async function loadConfig() {
 
     // Validate required fields
     if (!config.outDir) throw new Error("outDir is required");
-    if (!Array.isArray(config.
-      throw new Error("
+    if (!Array.isArray(config.sources))
+      throw new Error("sources must be an array");
 
-    // Validate
-    for (const [index,
-      if (typeof
-        throw new Error(`
+    // Validate source objects
+    for (const [index, sourceConfig] of config.sources.entries()) {
+      if (typeof sourceConfig !== "object" || sourceConfig === null) {
+        throw new Error(`sources[${index}] must be an object`);
      }
-      if (!
-        throw new Error(`
+      if (!sourceConfig.title) {
+        throw new Error(`sources[${index}].title is required`);
      }
-
-
+
+      // Set defaults
+      sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
+      sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
+      sourceConfig.customUrls = sourceConfig.customUrls || [];
+
+      // Either origin or customUrls must be provided
+      if (
+        !sourceConfig.origin &&
+        (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
+      ) {
+        throw new Error(
+          `sources[${index}] must have either origin or customUrls`
+        );
      }
-    }
 
-
-
-
+      // outDir is required unless keepOriginalUrls is true
+      if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
+        throw new Error(
+          `sources[${index}].outDir is required when keepOriginalUrls is false`
+        );
+      }
+
+      // Validate customUrls
+      for (const [urlIndex, customUrl] of (
+        sourceConfig.customUrls || []
+      ).entries()) {
+        if (
+          !customUrl.title ||
+          !customUrl.description ||
+          !customUrl.filename ||
+          !customUrl.url
+        ) {
+          throw new Error(
+            `sources[${index}].customUrls[${urlIndex}] must have title, description, filename, and url`
+          );
+        }
+      }
+    }
 
     return config;
   } catch (error) {
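To make the new rules concrete, here are a few illustrative source entries and how the validation above treats them (all names and URLs are made up):

```js
// Accepted: origin plus an outDir to write into.
const okSitemap = { title: "Docs", origin: "https://docs.example.com", outDir: "./docs/site" };

// Accepted: no outDir needed because keepOriginalUrls is true, and customUrls stands in for origin.
const okLinksOnly = {
  title: "Links",
  keepOriginalUrls: true,
  customUrls: [{ title: "A", description: "Page A", filename: "a", url: "https://example.com/a" }],
};

// Rejected: title is required.
const missingTitle = { origin: "https://docs.example.com", outDir: "./docs/site" };

// Rejected: must have either origin or customUrls.
const emptySource = { title: "Empty", outDir: "./docs/empty" };

// Rejected: outDir is required when keepOriginalUrls is false (the default).
const missingOutDir = { title: "Docs", origin: "https://docs.example.com" };
```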
@@ -389,7 +459,7 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
 
 /**
  * Process custom URLs through extraction API
- * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
  * @param {string} apiKey - API key for authentication
  * @returns {Promise<Record<string, any>>} Extracted files
  */
@@ -417,8 +487,7 @@ async function processCustomUrls(customUrls, apiKey) {
       const result = await response.json();
       if (result.results && result.results.length > 0) {
         const extracted = result.results[0];
-        const filename =
-          customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
+        const filename = customUrl.filename + ".md";
 
         files[filename] = {
           content: extracted.full_content || "",
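The effect of this change is that custom-URL output names now come from the required filename field rather than being derived from the title. A quick before/after sketch with a made-up entry:

```js
const customUrl = { title: "External API Guide", filename: "external-api" };

// 0.0.5: filename derived from the title
const before = customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md"; // "external_api_guide.md"

// 0.0.7: filename taken from the explicit field
const after = customUrl.filename + ".md"; // "external-api.md"
```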
@@ -428,6 +497,7 @@ async function processCustomUrls(customUrls, apiKey) {
           publishedDate: extracted.published_date || "",
           status: 200,
           tokens: Math.round((extracted.full_content || "").length / 5),
+          originalUrl: customUrl.url,
         };
       }
     } else {
@@ -444,6 +514,43 @@ async function processCustomUrls(customUrls, apiKey) {
   return files;
 }
 
+/**
+ * Generate combined llms.txt from all sources
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
+ * @returns {string} Combined llms.txt content
+ */
+function generateCombinedLlmsTxt(allSources) {
+  let combinedTxt =
+    "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
+
+  for (const source of allSources) {
+    combinedTxt += `## ${source.title}\n\n`;
+
+    // Sort files by path for consistent ordering
+    const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
+      a.localeCompare(b)
+    );
+
+    for (const [path, file] of sortedFiles) {
+      if (file.content || file.title) {
+        const title = file.title || path.replace(".md", "");
+        const description = file.description ? `: ${file.description}` : "";
+
+        // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
+        const link = source.keepOriginalUrls
+          ? file.originalUrl
+          : path.replace(".md", "");
+
+        combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
+      }
+    }
+
+    combinedTxt += "\n";
+  }
+
+  return combinedTxt;
+}
+
 /**
  * Clear stored API key credentials
  */
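For reference, generateCombinedLlmsTxt emits one `##` section per source and one bullet per file, and sources with keepOriginalUrls link to the original URL instead of a local path. A sketch of the output for two sources (the section titles match the default config above; paths, descriptions, and token counts are illustrative):

```
# Documentation Collection

> Combined documentation from multiple sources

## Parallel AI Documentation

- [Quickstart](/quickstart) (1200 tokens): Getting started with the API
- [Tasks API](/tasks-api) (2400 tokens): Reference for the Tasks endpoint

## External References

- [External API Guide](https://external.com/api-guide) (900 tokens): Third-party API documentation
```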
@@ -477,114 +584,145 @@ async function main() {
     const config = await loadConfig();
     const apiKey = await getApiKey();
 
-    // Ensure output directory exists
+    // Ensure top-level output directory exists
     fs.mkdirSync(config.outDir, { recursive: true });
 
-
-    const previousManifest = loadManifest(config.outDir);
-    const currentFiles = [];
-
+    const allSources = [];
     let totalTokens = 0;
     let totalPages = 0;
     let totalErrors = 0;
 
-    // Process each
-    for (const
+    // Process each source
+    for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
+      const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;
+
       console.log(
-        `\nProcessing
+        `\nProcessing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
       );
 
+      // Only ensure source output directory exists if not keeping original URLs
+      if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+        fs.mkdirSync(sourceConfig.outDir, { recursive: true });
+      }
+
+      // Load previous manifest for this source (only if we have an outDir)
+      const previousManifest = sourceConfig.outDir
+        ? loadManifest(sourceConfig.outDir)
+        : { files: [], timestamp: new Date().toISOString() };
+      const currentFiles = [];
+      let sourceFiles = {};
+
       try {
-
-
-
-
-
+        // Process origin if provided
+        if (sourceConfig.origin) {
+          const result = await extractFromSitemap(
+            sourceConfig.origin,
+            sourceConfig.forceExtract,
+            apiKey
+          );
+
+          console.log(
+            `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
+          );
+          if (result.errors > 0) {
+            console.log(`⚠️ ${result.errors} errors occurred`);
+          }
 
-
-
-
-
-          console.log(`⚠️ ${result.errors} errors occurred`);
+          sourceFiles = result.files;
+          totalTokens += result.totalTokens;
+          totalPages += result.totalPages;
+          totalErrors += result.errors;
         }
 
-      //
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        );
-      } else {
-        filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
+        // Process custom URLs for this source
+        if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
+          console.log(
+            `Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
+          );
+          const customFiles = await processCustomUrls(
+            sourceConfig.customUrls,
+            apiKey
+          );
+
+          // Merge custom files with sitemap files
+          sourceFiles = { ...sourceFiles, ...customFiles };
+
+          for (const file of Object.values(customFiles)) {
+            totalTokens += file.tokens;
+            totalPages++;
           }
+        }
 
-
-
+        // Write files to source directory (only if not keeping original URLs)
+        if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+          for (const [filePath, file] of Object.entries(sourceFiles)) {
+            let filename = filePath.startsWith("/")
+              ? filePath.slice(1)
+              : filePath;
 
-
-
-            currentFiles.push(filename);
+            const fullFilePath = path.join(sourceConfig.outDir, filename);
+            const fileDir = path.dirname(fullFilePath);
 
-
-
+            fs.mkdirSync(fileDir, { recursive: true });
+            fs.writeFileSync(fullFilePath, file.content);
+            currentFiles.push(filename);
 
-
-
-
-
-
-
-          error.message
-        );
-        totalErrors++;
-      }
-    }
+            console.log(
+              `Wrote: ${path.join(sourceConfig.outDir, filename)} (${
+                file.tokens
+              } tokens)`
+            );
+          }
 
-
-
-
-
+          // Clean up old files for this source
+          if (previousManifest.files.length > 0) {
+            cleanupOldFiles(
+              sourceConfig.outDir,
+              currentFiles,
+              previousManifest.files
+            );
+          }
 
-
-
-
-
-
-
+          // Save manifest for this source
+          const newManifest = {
+            files: currentFiles,
+            timestamp: new Date().toISOString(),
+          };
+          saveManifest(sourceConfig.outDir, newManifest);
+        } else {
+          console.log(
+            `Keeping original URLs - not saving files locally for ${sourceName}`
+          );
+        }
 
-
+        // Add to all sources for combined llms.txt
+        allSources.push({
+          title: sourceConfig.title,
+          files: sourceFiles,
+          keepOriginalUrls: sourceConfig.keepOriginalUrls,
+        });
+      } catch (error) {
+        console.error(`❌ Error processing ${sourceName}:`, error.message);
+        totalErrors++;
       }
     }
 
-    //
-    if (
-
+    // Generate and write combined llms.txt to top-level outDir
+    if (allSources.length > 0) {
+      const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
+      const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
+      fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
+      console.log(`\nGenerated combined llms.txt: ${combinedLlmsTxtPath}`);
    }
 
-    // Save new manifest
-    const newManifest = {
-      files: currentFiles,
-      timestamp: new Date().toISOString(),
-    };
-    saveManifest(config.outDir, newManifest);
-
     console.log("\n✨ Extraction completed!");
     console.log(`Total: ${totalPages} pages, ${totalTokens} tokens`);
     if (totalErrors > 0) {
       console.log(`⚠️ Errors: ${totalErrors}`);
     }
-    console.log(
+    console.log(
+      `Top-level output directory: ${path.resolve(config.outDir)}`
+    );
     console.log("\nUse --clear-credentials to remove stored API key");
   } catch (error) {
     console.error("Fatal error:", error.message);
package/mod.js
CHANGED
@@ -8,6 +8,7 @@
  * @property {boolean} extracted - Whether the content was extracted or directly fetched
  * @property {number} status - HTTP status code or processing status
  * @property {number} tokens - Number of tokens in the content
+ * @property {string} originalUrl - The original URL of the content
  */
 
 /**
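With this addition, every entry in the files map returned by extractFromSitemap carries the page's original URL alongside the existing fields. A sketch of one record (field names from the typedef and the call sites below; values are illustrative):

```js
// One value of the `files` map keyed by path, e.g. files["/quickstart.md"]
const fileRecord = {
  content: "# Quickstart\n\n...",  // markdown content
  title: "Quickstart",
  description: "Getting started",
  extracted: true,                  // extracted via the API vs. directly fetched
  publishedDate: "",
  status: 200,                      // HTTP status code or processing status
  tokens: 1200,                     // roughly content.length / 5
  originalUrl: "https://docs.example.com/quickstart", // new in 0.0.7
};
```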
@@ -62,6 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
         status: result.status,
         tokens: Math.round(result.content.length / 5),
         publishedDate: result.publishedDate || "",
+        originalUrl: urlStr,
         error: result.error,
       };
 
@@ -80,6 +82,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
         status: 0,
         tokens: 0,
         publishedDate: "",
+        originalUrl: urlStr,
       };
       if (!forceExtract) {
         urlsNeedingExtract.push(urlStr);
@@ -108,6 +111,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
         status: 0,
         tokens: 0,
         publishedDate: "",
+        originalUrl: result.url,
       };
 
       const content = result.full_content || existing.content;
@@ -122,6 +126,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
         publishedDate: result.published_date || existing.publishedDate,
         status: existing.status,
         tokens: Math.round(content.length / 5),
+        originalUrl: existing.originalUrl,
       };
     }
 
@@ -137,18 +142,6 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
     }
   }
 
-  // Generate llms.txt
-  const llmsTxt = generateLlmsTxt(origin, files);
-  files["/llms.txt"] = {
-    content: llmsTxt,
-    title: "LLMs.txt",
-    description: "LLM-friendly content listing",
-    extracted: false,
-    publishedDate: "",
-    status: 200,
-    tokens: Math.round(llmsTxt.length / 5),
-  };
-
   // Sort files by path
   const sortedFiles = Object.keys(files)
     .sort()
@@ -162,7 +155,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
     (sum, file) => sum + file.tokens,
     0
   );
-  const totalPages = Object.keys(sortedFiles).length
+  const totalPages = Object.keys(sortedFiles).length;
   const errors = Object.values(sortedFiles).filter((file) => file.error).length;
   const processingTimeMs = Date.now() - startTime;
 
@@ -527,44 +520,6 @@ function getPathFromUrl(urlStr) {
   }
 }
 
-/**
- * Generate llms.txt content
- * @param {string} origin - Site origin
- * @param {Record<string, any>} files - Files object
- * @returns {string} Generated llms.txt content
- */
-function generateLlmsTxt(origin, files) {
-  // Find homepage for top-level description
-  const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
-  const siteTitle =
-    homepageFile?.title ||
-    new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
-  const siteDescription =
-    homepageFile?.description || `Documentation for ${siteTitle}`;
-
-  let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
-
-  // Add documentation section
-  llmsTxt += "## Documentation\n\n";
-
-  // Sort files by path for consistent ordering
-  const sortedFiles = Object.entries(files)
-    .filter(([path]) => path !== "/llms.txt")
-    .sort(([a], [b]) => a.localeCompare(b));
-
-  for (const [path, file] of sortedFiles) {
-    if (file.content || file.title) {
-      const title = file.title || path.replace(".md", "");
-      const description = file.description ? `: ${file.description}` : "";
-      llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
-        file.tokens
-      } tokens)${description}\n`;
-    }
-  }
-
-  return llmsTxt;
-}
-
 /**
  * Call Parallel Extract API for multiple URLs
  * @param {string[]} urls - URLs to extract
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.5",
+  "version": "0.0.7",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [
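Since package.json exposes cli.js as the package binary, the CLI can be invoked with npx; a usage sketch (only the --clear-credentials flag appears in this diff, other flags are not shown):

```sh
# Run the extractor with the project's config
npx extract-from-sitemap

# Remove the stored Parallel.ai API key
npx extract-from-sitemap --clear-credentials
```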