extract-from-sitemap 0.0.12 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -0
- package/cli.js +23 -86
- package/package.json +1 -1
package/README.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
This repo allows you to create a static markdown bundle based on one or multiple sources. The sources must either have a functional and complete sitemap, or should specify custom urls to be extracted.
|
|
2
|
+
|
|
3
|
+
## Step by Step Guide
|
|
4
|
+
|
|
5
|
+
1. Create a `llmtext.json` file in the root of your project. This is where you define your sources to be extracted from. For an example combining multiple sources, see [this example](https://github.com/janwilmake/parallel-llmtext/blob/main/llmtext.json).
|
|
6
|
+
2. Run `npx extract-from-sitemap` (or add it to your `package.json` scripts, [like this](https://github.com/janwilmake/parallel-llmtext/blob/main/package.json))
|
|
7
|
+
3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed. **Example coming soon**
|
|
8
|
+
4. Use an agent-rewriter such as [next-agent-rewriter](../next-agent-rewriter) to rewrite agent requests to the appropriate static markdown files. In addition, it's best practice to add a link in your html to show the markdown variant is available, like this: `<link rel="alternate" type="text/markdown" href="{path}.md" title="Docs" />`
|
|
9
|
+
|
|
10
|
+
## Known limitations
|
|
11
|
+
|
|
12
|
+
This library is in active development. Known limitations:
|
|
13
|
+
|
|
14
|
+
- Does not work for nested sitemaps
|
|
15
|
+
- Does not work on sitemaps that are too large
|
|
16
|
+
- Example to make it recurring is still missing
|
|
17
|
+
|
|
18
|
+
I am working on addressing these issues.
|
package/cli.js
CHANGED
|
@@ -21,16 +21,12 @@ const { extractFromSitemap } = require("./mod.js");
|
|
|
21
21
|
*/
|
|
22
22
|
/**
|
|
23
23
|
* @typedef {Object} Config
|
|
24
|
+
* @property {string} description - Description of the documentation collection
|
|
25
|
+
* @property {string} [details] - Optional additional details about the collection
|
|
24
26
|
* @property {string} outDir - Top-level output directory for combined llms.txt
|
|
25
27
|
* @property {SourceConfig[]} sources - Array of source configurations
|
|
26
28
|
*/
|
|
27
29
|
|
|
28
|
-
/**
|
|
29
|
-
* @typedef {Object} Manifest
|
|
30
|
-
* @property {string[]} files - List of generated files
|
|
31
|
-
* @property {string} timestamp - Timestamp of last generation
|
|
32
|
-
*/
|
|
33
|
-
|
|
34
30
|
const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
|
|
35
31
|
const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
|
|
36
32
|
|
|
@@ -218,6 +214,9 @@ async function loadConfig() {
|
|
|
218
214
|
JSON.stringify(
|
|
219
215
|
{
|
|
220
216
|
$schema: "https://extract.llmtext.com/llmtext.schema.json",
|
|
217
|
+
description: "Combined documentation from multiple sources",
|
|
218
|
+
details:
|
|
219
|
+
"This collection includes API documentation, guides, and references.",
|
|
221
220
|
outDir: "./docs",
|
|
222
221
|
sources: [
|
|
223
222
|
{
|
|
@@ -274,6 +273,7 @@ async function loadConfig() {
|
|
|
274
273
|
const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
|
|
275
274
|
|
|
276
275
|
// Validate required fields
|
|
276
|
+
if (!config.description) throw new Error("description is required");
|
|
277
277
|
if (!config.outDir) throw new Error("outDir is required");
|
|
278
278
|
if (!Array.isArray(config.sources))
|
|
279
279
|
throw new Error("sources must be an array");
|
|
@@ -416,59 +416,6 @@ async function getApiKey() {
|
|
|
416
416
|
return newApiKey;
|
|
417
417
|
}
|
|
418
418
|
|
|
419
|
-
/**
|
|
420
|
-
* Load manifest file
|
|
421
|
-
* @param {string} outDir - Output directory
|
|
422
|
-
* @returns {Manifest} The manifest object
|
|
423
|
-
*/
|
|
424
|
-
function loadManifest(outDir) {
|
|
425
|
-
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
426
|
-
|
|
427
|
-
if (!fs.existsSync(manifestPath)) {
|
|
428
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
429
|
-
}
|
|
430
|
-
|
|
431
|
-
try {
|
|
432
|
-
return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
|
|
433
|
-
} catch {
|
|
434
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
435
|
-
}
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
/**
|
|
439
|
-
* Save manifest file
|
|
440
|
-
* @param {string} outDir - Output directory
|
|
441
|
-
* @param {Manifest} manifest - The manifest to save
|
|
442
|
-
*/
|
|
443
|
-
function saveManifest(outDir, manifest) {
|
|
444
|
-
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
445
|
-
fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
|
|
446
|
-
}
|
|
447
|
-
|
|
448
|
-
/**
|
|
449
|
-
* Clean up old files that are no longer generated
|
|
450
|
-
* @param {string} outDir - Output directory
|
|
451
|
-
* @param {string[]} currentFiles - Currently generated files
|
|
452
|
-
* @param {string[]} previousFiles - Previously generated files
|
|
453
|
-
*/
|
|
454
|
-
function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
455
|
-
const filesToRemove = previousFiles.filter(
|
|
456
|
-
(file) => !currentFiles.includes(file)
|
|
457
|
-
);
|
|
458
|
-
|
|
459
|
-
for (const file of filesToRemove) {
|
|
460
|
-
const filePath = path.join(outDir, file);
|
|
461
|
-
try {
|
|
462
|
-
if (fs.existsSync(filePath)) {
|
|
463
|
-
fs.rmSync(filePath);
|
|
464
|
-
console.log(`🗑️ Removed old file: ${file}`);
|
|
465
|
-
}
|
|
466
|
-
} catch (error) {
|
|
467
|
-
console.warn(`⚠️ Could not remove ${file}:`, error.message);
|
|
468
|
-
}
|
|
469
|
-
}
|
|
470
|
-
}
|
|
471
|
-
|
|
472
419
|
/**
|
|
473
420
|
* Process custom URLs through extraction API
|
|
474
421
|
* @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
|
|
@@ -546,12 +493,17 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
|
|
|
546
493
|
|
|
547
494
|
/**
|
|
548
495
|
* Generate combined llms.txt from all sources
|
|
496
|
+
* @param {string} description - Top-level description
|
|
497
|
+
* @param {string} [details] - Optional top-level details
|
|
549
498
|
* @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
|
|
550
499
|
* @returns {string} Combined llms.txt content
|
|
551
500
|
*/
|
|
552
|
-
function generateCombinedLlmsTxt(allSources) {
|
|
553
|
-
let combinedTxt =
|
|
554
|
-
|
|
501
|
+
function generateCombinedLlmsTxt(description, details, allSources) {
|
|
502
|
+
let combinedTxt = `# Documentation Collection\n\n> ${description}\n\n`;
|
|
503
|
+
|
|
504
|
+
if (details) {
|
|
505
|
+
combinedTxt += `${details}\n\n`;
|
|
506
|
+
}
|
|
555
507
|
|
|
556
508
|
for (const source of allSources) {
|
|
557
509
|
combinedTxt += `## ${source.title}\n\n`;
|
|
@@ -574,7 +526,10 @@ function generateCombinedLlmsTxt(allSources) {
|
|
|
574
526
|
link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
|
|
575
527
|
}
|
|
576
528
|
|
|
577
|
-
combinedTxt += `- [${title}](${link})
|
|
529
|
+
combinedTxt += `- [${title}](${link}): ${description.replaceAll(
|
|
530
|
+
"\n",
|
|
531
|
+
" "
|
|
532
|
+
)}\n`;
|
|
578
533
|
}
|
|
579
534
|
}
|
|
580
535
|
|
|
@@ -638,11 +593,6 @@ async function main() {
|
|
|
638
593
|
fs.mkdirSync(sourceConfig.outDir, { recursive: true });
|
|
639
594
|
}
|
|
640
595
|
|
|
641
|
-
// Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
|
|
642
|
-
const previousManifest = !sourceConfig.keepOriginalUrls
|
|
643
|
-
? loadManifest(sourceConfig.outDir)
|
|
644
|
-
: { files: [], timestamp: new Date().toISOString() };
|
|
645
|
-
const currentFiles = [];
|
|
646
596
|
let sourceFiles = {};
|
|
647
597
|
|
|
648
598
|
try {
|
|
@@ -699,7 +649,6 @@ async function main() {
|
|
|
699
649
|
|
|
700
650
|
fs.mkdirSync(fileDir, { recursive: true });
|
|
701
651
|
fs.writeFileSync(fullFilePath, file.content);
|
|
702
|
-
currentFiles.push(filename);
|
|
703
652
|
|
|
704
653
|
console.log(
|
|
705
654
|
`š Wrote: ${path.join(sourceConfig.outDir, filename)} (${
|
|
@@ -707,22 +656,6 @@ async function main() {
|
|
|
707
656
|
} tokens)`
|
|
708
657
|
);
|
|
709
658
|
}
|
|
710
|
-
|
|
711
|
-
// Clean up old files for this source
|
|
712
|
-
if (previousManifest.files.length > 0) {
|
|
713
|
-
cleanupOldFiles(
|
|
714
|
-
sourceConfig.outDir,
|
|
715
|
-
currentFiles,
|
|
716
|
-
previousManifest.files
|
|
717
|
-
);
|
|
718
|
-
}
|
|
719
|
-
|
|
720
|
-
// Save manifest for this source
|
|
721
|
-
const newManifest = {
|
|
722
|
-
files: currentFiles,
|
|
723
|
-
timestamp: new Date().toISOString(),
|
|
724
|
-
};
|
|
725
|
-
saveManifest(sourceConfig.outDir, newManifest);
|
|
726
659
|
} else {
|
|
727
660
|
console.log(
|
|
728
661
|
`š Keeping original URLs - not saving files locally for ${sourceName}`
|
|
@@ -749,7 +682,11 @@ async function main() {
|
|
|
749
682
|
|
|
750
683
|
// Generate and write combined llms.txt to top-level outDir
|
|
751
684
|
if (allSources.length > 0) {
|
|
752
|
-
const combinedLlmsTxt = generateCombinedLlmsTxt(
|
|
685
|
+
const combinedLlmsTxt = generateCombinedLlmsTxt(
|
|
686
|
+
config.description,
|
|
687
|
+
config.details,
|
|
688
|
+
allSources
|
|
689
|
+
);
|
|
753
690
|
const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
|
|
754
691
|
fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
|
|
755
692
|
console.log(`\nš Generated combined llms.txt: ${combinedLlmsTxtPath}`);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.12",
|
|
4
|
+
"version": "0.0.14",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|