extract-from-sitemap 0.0.12 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +18 -0
  2. package/cli.js +23 -86
  3. package/package.json +1 -1
package/README.md ADDED
@@ -0,0 +1,18 @@
1
+ This repo allows you to create a static markdown bundle based on one or multiple sources. Each source must either have a functional and complete sitemap or specify custom URLs to be extracted.
2
+
3
+ ## Step by Step Guide
4
+
5
+ 1. Create a `llmtext.json` file in the root of your project. This is where you define your sources to be extracted from. For an example combining multiple sources, see [this example](https://github.com/janwilmake/parallel-llmtext/blob/main/llmtext.json).
6
+ 2. Run `npx extract-from-sitemap` (or add it to your `package.json` scripts, [like this](https://github.com/janwilmake/parallel-llmtext/blob/main/package.json))
7
+ 3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed. **Example coming soon**
8
+ 4. Use an agent-rewriter such as [next-agent-rewriter](../next-agent-rewriter) to rewrite agent requests to the appropriate static markdown files. In addition, it's best practice to add a link in your HTML to indicate that a markdown variant is available, like this: `<link rel="alternate" type="text/markdown" href="{path}.md" title="Docs" />`
9
+
10
+ ## Known limitations
11
+
12
+ This library is in active development. Known limitations:
13
+
14
+ - Does not work for nested sitemaps
15
+ - Does not work on sitemaps that are too large
16
+ - An example of how to run the extraction on a recurring schedule is still missing
17
+
18
+ I am working on addressing these issues.
package/cli.js CHANGED
@@ -21,16 +21,12 @@ const { extractFromSitemap } = require("./mod.js");
21
21
  */
22
22
  /**
23
23
  * @typedef {Object} Config
24
+ * @property {string} description - Description of the documentation collection
25
+ * @property {string} [details] - Optional additional details about the collection
24
26
  * @property {string} outDir - Top-level output directory for combined llms.txt
25
27
  * @property {SourceConfig[]} sources - Array of source configurations
26
28
  */
27
29
 
28
- /**
29
- * @typedef {Object} Manifest
30
- * @property {string[]} files - List of generated files
31
- * @property {string} timestamp - Timestamp of last generation
32
- */
33
-
34
30
  const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
35
31
  const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
36
32
 
@@ -218,6 +214,9 @@ async function loadConfig() {
218
214
  JSON.stringify(
219
215
  {
220
216
  $schema: "https://extract.llmtext.com/llmtext.schema.json",
217
+ description: "Combined documentation from multiple sources",
218
+ details:
219
+ "This collection includes API documentation, guides, and references.",
221
220
  outDir: "./docs",
222
221
  sources: [
223
222
  {
@@ -274,6 +273,7 @@ async function loadConfig() {
274
273
  const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
275
274
 
276
275
  // Validate required fields
276
+ if (!config.description) throw new Error("description is required");
277
277
  if (!config.outDir) throw new Error("outDir is required");
278
278
  if (!Array.isArray(config.sources))
279
279
  throw new Error("sources must be an array");
@@ -416,59 +416,6 @@ async function getApiKey() {
416
416
  return newApiKey;
417
417
  }
418
418
 
419
- /**
420
- * Load manifest file
421
- * @param {string} outDir - Output directory
422
- * @returns {Manifest} The manifest object
423
- */
424
- function loadManifest(outDir) {
425
- const manifestPath = path.join(outDir, "llmtext-manifest.json");
426
-
427
- if (!fs.existsSync(manifestPath)) {
428
- return { files: [], timestamp: new Date().toISOString() };
429
- }
430
-
431
- try {
432
- return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
433
- } catch {
434
- return { files: [], timestamp: new Date().toISOString() };
435
- }
436
- }
437
-
438
- /**
439
- * Save manifest file
440
- * @param {string} outDir - Output directory
441
- * @param {Manifest} manifest - The manifest to save
442
- */
443
- function saveManifest(outDir, manifest) {
444
- const manifestPath = path.join(outDir, "llmtext-manifest.json");
445
- fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
446
- }
447
-
448
- /**
449
- * Clean up old files that are no longer generated
450
- * @param {string} outDir - Output directory
451
- * @param {string[]} currentFiles - Currently generated files
452
- * @param {string[]} previousFiles - Previously generated files
453
- */
454
- function cleanupOldFiles(outDir, currentFiles, previousFiles) {
455
- const filesToRemove = previousFiles.filter(
456
- (file) => !currentFiles.includes(file)
457
- );
458
-
459
- for (const file of filesToRemove) {
460
- const filePath = path.join(outDir, file);
461
- try {
462
- if (fs.existsSync(filePath)) {
463
- fs.rmSync(filePath);
464
- console.log(`šŸ—‘ļø Removed old file: ${file}`);
465
- }
466
- } catch (error) {
467
- console.warn(`āš ļø Could not remove ${file}:`, error.message);
468
- }
469
- }
470
- }
471
-
472
419
  /**
473
420
  * Process custom URLs through extraction API
474
421
  * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
@@ -546,12 +493,17 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
546
493
 
547
494
  /**
548
495
  * Generate combined llms.txt from all sources
496
+ * @param {string} description - Top-level description
497
+ * @param {string} [details] - Optional top-level details
549
498
  * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
550
499
  * @returns {string} Combined llms.txt content
551
500
  */
552
- function generateCombinedLlmsTxt(allSources) {
553
- let combinedTxt =
554
- "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
501
+ function generateCombinedLlmsTxt(description, details, allSources) {
502
+ let combinedTxt = `# Documentation Collection\n\n> ${description}\n\n`;
503
+
504
+ if (details) {
505
+ combinedTxt += `${details}\n\n`;
506
+ }
555
507
 
556
508
  for (const source of allSources) {
557
509
  combinedTxt += `## ${source.title}\n\n`;
@@ -574,7 +526,10 @@ function generateCombinedLlmsTxt(allSources) {
574
526
  link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
575
527
  }
576
528
 
577
- combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
529
+ combinedTxt += `- [${title}](${link}): ${description.replaceAll(
530
+ "\n",
531
+ " "
532
+ )}\n`;
578
533
  }
579
534
  }
580
535
 
@@ -638,11 +593,6 @@ async function main() {
638
593
  fs.mkdirSync(sourceConfig.outDir, { recursive: true });
639
594
  }
640
595
 
641
- // Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
642
- const previousManifest = !sourceConfig.keepOriginalUrls
643
- ? loadManifest(sourceConfig.outDir)
644
- : { files: [], timestamp: new Date().toISOString() };
645
- const currentFiles = [];
646
596
  let sourceFiles = {};
647
597
 
648
598
  try {
@@ -699,7 +649,6 @@ async function main() {
699
649
 
700
650
  fs.mkdirSync(fileDir, { recursive: true });
701
651
  fs.writeFileSync(fullFilePath, file.content);
702
- currentFiles.push(filename);
703
652
 
704
653
  console.log(
705
654
  `šŸ“ Wrote: ${path.join(sourceConfig.outDir, filename)} (${
@@ -707,22 +656,6 @@ async function main() {
707
656
  } tokens)`
708
657
  );
709
658
  }
710
-
711
- // Clean up old files for this source
712
- if (previousManifest.files.length > 0) {
713
- cleanupOldFiles(
714
- sourceConfig.outDir,
715
- currentFiles,
716
- previousManifest.files
717
- );
718
- }
719
-
720
- // Save manifest for this source
721
- const newManifest = {
722
- files: currentFiles,
723
- timestamp: new Date().toISOString(),
724
- };
725
- saveManifest(sourceConfig.outDir, newManifest);
726
659
  } else {
727
660
  console.log(
728
661
  `šŸ“‹ Keeping original URLs - not saving files locally for ${sourceName}`
@@ -749,7 +682,11 @@ async function main() {
749
682
 
750
683
  // Generate and write combined llms.txt to top-level outDir
751
684
  if (allSources.length > 0) {
752
- const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
685
+ const combinedLlmsTxt = generateCombinedLlmsTxt(
686
+ config.description,
687
+ config.details,
688
+ allSources
689
+ );
753
690
  const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
754
691
  fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
755
692
  console.log(`\nšŸ“‹ Generated combined llms.txt: ${combinedLlmsTxtPath}`);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.12",
4
+ "version": "0.0.14",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [