extract-from-sitemap 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/cli.js +19 -85
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -21,16 +21,12 @@ const { extractFromSitemap } = require("./mod.js");
21
21
  */
22
22
  /**
23
23
  * @typedef {Object} Config
24
+ * @property {string} description - Description of the documentation collection
25
+ * @property {string} [details] - Optional additional details about the collection
24
26
  * @property {string} outDir - Top-level output directory for combined llms.txt
25
27
  * @property {SourceConfig[]} sources - Array of source configurations
26
28
  */
27
29
 
28
- /**
29
- * @typedef {Object} Manifest
30
- * @property {string[]} files - List of generated files
31
- * @property {string} timestamp - Timestamp of last generation
32
- */
33
-
34
30
  const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
35
31
  const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
36
32
 
@@ -218,6 +214,9 @@ async function loadConfig() {
218
214
  JSON.stringify(
219
215
  {
220
216
  $schema: "https://extract.llmtext.com/llmtext.schema.json",
217
+ description: "Combined documentation from multiple sources",
218
+ details:
219
+ "This collection includes API documentation, guides, and references.",
221
220
  outDir: "./docs",
222
221
  sources: [
223
222
  {
@@ -274,6 +273,7 @@ async function loadConfig() {
274
273
  const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
275
274
 
276
275
  // Validate required fields
276
+ if (!config.description) throw new Error("description is required");
277
277
  if (!config.outDir) throw new Error("outDir is required");
278
278
  if (!Array.isArray(config.sources))
279
279
  throw new Error("sources must be an array");
@@ -416,59 +416,6 @@ async function getApiKey() {
416
416
  return newApiKey;
417
417
  }
418
418
 
419
- /**
420
- * Load manifest file
421
- * @param {string} outDir - Output directory
422
- * @returns {Manifest} The manifest object
423
- */
424
- function loadManifest(outDir) {
425
- const manifestPath = path.join(outDir, "llmtext-manifest.json");
426
-
427
- if (!fs.existsSync(manifestPath)) {
428
- return { files: [], timestamp: new Date().toISOString() };
429
- }
430
-
431
- try {
432
- return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
433
- } catch {
434
- return { files: [], timestamp: new Date().toISOString() };
435
- }
436
- }
437
-
438
- /**
439
- * Save manifest file
440
- * @param {string} outDir - Output directory
441
- * @param {Manifest} manifest - The manifest to save
442
- */
443
- function saveManifest(outDir, manifest) {
444
- const manifestPath = path.join(outDir, "llmtext-manifest.json");
445
- fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
446
- }
447
-
448
- /**
449
- * Clean up old files that are no longer generated
450
- * @param {string} outDir - Output directory
451
- * @param {string[]} currentFiles - Currently generated files
452
- * @param {string[]} previousFiles - Previously generated files
453
- */
454
- function cleanupOldFiles(outDir, currentFiles, previousFiles) {
455
- const filesToRemove = previousFiles.filter(
456
- (file) => !currentFiles.includes(file)
457
- );
458
-
459
- for (const file of filesToRemove) {
460
- const filePath = path.join(outDir, file);
461
- try {
462
- if (fs.existsSync(filePath)) {
463
- fs.rmSync(filePath);
464
- console.log(`🗑️ Removed old file: ${file}`);
465
- }
466
- } catch (error) {
467
- console.warn(`⚠️ Could not remove ${file}:`, error.message);
468
- }
469
- }
470
- }
471
-
472
419
  /**
473
420
  * Process custom URLs through extraction API
474
421
  * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
@@ -546,12 +493,17 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
546
493
 
547
494
  /**
548
495
  * Generate combined llms.txt from all sources
496
+ * @param {string} description - Top-level description
497
+ * @param {string} [details] - Optional top-level details
549
498
  * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
550
499
  * @returns {string} Combined llms.txt content
551
500
  */
552
- function generateCombinedLlmsTxt(allSources) {
553
- let combinedTxt =
554
- "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
501
+ function generateCombinedLlmsTxt(description, details, allSources) {
502
+ let combinedTxt = `# Documentation Collection\n\n> ${description}\n\n`;
503
+
504
+ if (details) {
505
+ combinedTxt += `${details}\n\n`;
506
+ }
555
507
 
556
508
  for (const source of allSources) {
557
509
  combinedTxt += `## ${source.title}\n\n`;
@@ -641,11 +593,6 @@ async function main() {
641
593
  fs.mkdirSync(sourceConfig.outDir, { recursive: true });
642
594
  }
643
595
 
644
- // Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
645
- const previousManifest = !sourceConfig.keepOriginalUrls
646
- ? loadManifest(sourceConfig.outDir)
647
- : { files: [], timestamp: new Date().toISOString() };
648
- const currentFiles = [];
649
596
  let sourceFiles = {};
650
597
 
651
598
  try {
@@ -702,7 +649,6 @@ async function main() {
702
649
 
703
650
  fs.mkdirSync(fileDir, { recursive: true });
704
651
  fs.writeFileSync(fullFilePath, file.content);
705
- currentFiles.push(filename);
706
652
 
707
653
  console.log(
708
654
  `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
@@ -710,22 +656,6 @@ async function main() {
710
656
  } tokens)`
711
657
  );
712
658
  }
713
-
714
- // Clean up old files for this source
715
- if (previousManifest.files.length > 0) {
716
- cleanupOldFiles(
717
- sourceConfig.outDir,
718
- currentFiles,
719
- previousManifest.files
720
- );
721
- }
722
-
723
- // Save manifest for this source
724
- const newManifest = {
725
- files: currentFiles,
726
- timestamp: new Date().toISOString(),
727
- };
728
- saveManifest(sourceConfig.outDir, newManifest);
729
659
  } else {
730
660
  console.log(
731
661
  `📋 Keeping original URLs - not saving files locally for ${sourceName}`
@@ -752,7 +682,11 @@ async function main() {
752
682
 
753
683
  // Generate and write combined llms.txt to top-level outDir
754
684
  if (allSources.length > 0) {
755
- const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
685
+ const combinedLlmsTxt = generateCombinedLlmsTxt(
686
+ config.description,
687
+ config.details,
688
+ allSources
689
+ );
756
690
  const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
757
691
  fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
758
692
  console.log(`\n📋 Generated combined llms.txt: ${combinedLlmsTxtPath}`);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.13",
4
+ "version": "0.0.14",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [