extract-from-sitemap 0.0.13 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/cli.js +23 -85
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -21,16 +21,13 @@ const { extractFromSitemap } = require("./mod.js");
21
21
  */
22
22
  /**
23
23
  * @typedef {Object} Config
24
+ * @property {string} title - Title of your document
25
+ * @property {string} description - Description of the documentation collection
26
+ * @property {string} [details] - Optional additional details about the collection
24
27
  * @property {string} outDir - Top-level output directory for combined llms.txt
25
28
  * @property {SourceConfig[]} sources - Array of source configurations
26
29
  */
27
30
 
28
- /**
29
- * @typedef {Object} Manifest
30
- * @property {string[]} files - List of generated files
31
- * @property {string} timestamp - Timestamp of last generation
32
- */
33
-
34
31
  const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
35
32
  const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
36
33
 
@@ -218,6 +215,10 @@ async function loadConfig() {
218
215
  JSON.stringify(
219
216
  {
220
217
  $schema: "https://extract.llmtext.com/llmtext.schema.json",
218
+ title: "Parallel Web Systems",
219
+ description: "Combined documentation from multiple sources",
220
+ details:
221
+ "This collection includes API documentation, guides, and references.",
221
222
  outDir: "./docs",
222
223
  sources: [
223
224
  {
@@ -274,6 +275,7 @@ async function loadConfig() {
274
275
  const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
275
276
 
276
277
  // Validate required fields
278
+ if (!config.description) throw new Error("description is required");
277
279
  if (!config.outDir) throw new Error("outDir is required");
278
280
  if (!Array.isArray(config.sources))
279
281
  throw new Error("sources must be an array");
@@ -416,59 +418,6 @@ async function getApiKey() {
416
418
  return newApiKey;
417
419
  }
418
420
 
419
- /**
420
- * Load manifest file
421
- * @param {string} outDir - Output directory
422
- * @returns {Manifest} The manifest object
423
- */
424
- function loadManifest(outDir) {
425
- const manifestPath = path.join(outDir, "llmtext-manifest.json");
426
-
427
- if (!fs.existsSync(manifestPath)) {
428
- return { files: [], timestamp: new Date().toISOString() };
429
- }
430
-
431
- try {
432
- return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
433
- } catch {
434
- return { files: [], timestamp: new Date().toISOString() };
435
- }
436
- }
437
-
438
- /**
439
- * Save manifest file
440
- * @param {string} outDir - Output directory
441
- * @param {Manifest} manifest - The manifest to save
442
- */
443
- function saveManifest(outDir, manifest) {
444
- const manifestPath = path.join(outDir, "llmtext-manifest.json");
445
- fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
446
- }
447
-
448
- /**
449
- * Clean up old files that are no longer generated
450
- * @param {string} outDir - Output directory
451
- * @param {string[]} currentFiles - Currently generated files
452
- * @param {string[]} previousFiles - Previously generated files
453
- */
454
- function cleanupOldFiles(outDir, currentFiles, previousFiles) {
455
- const filesToRemove = previousFiles.filter(
456
- (file) => !currentFiles.includes(file)
457
- );
458
-
459
- for (const file of filesToRemove) {
460
- const filePath = path.join(outDir, file);
461
- try {
462
- if (fs.existsSync(filePath)) {
463
- fs.rmSync(filePath);
464
- console.log(`🗑️ Removed old file: ${file}`);
465
- }
466
- } catch (error) {
467
- console.warn(`⚠️ Could not remove ${file}:`, error.message);
468
- }
469
- }
470
- }
471
-
472
421
  /**
473
422
  * Process custom URLs through extraction API
474
423
  * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
@@ -546,12 +495,18 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
546
495
 
547
496
  /**
548
497
  * Generate combined llms.txt from all sources
498
+ * @param {string} title - Top-level title
499
+ * @param {string} description - Top-level description
500
+ * @param {string} [details] - Optional top-level details
549
501
  * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
550
502
  * @returns {string} Combined llms.txt content
551
503
  */
552
- function generateCombinedLlmsTxt(allSources) {
553
- let combinedTxt =
554
- "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
504
+ function generateCombinedLlmsTxt(title, description, details, allSources) {
505
+ let combinedTxt = `# Documentation Collection\n\n> ${description}\n\n`;
506
+
507
+ if (details) {
508
+ combinedTxt += `${details}\n\n`;
509
+ }
555
510
 
556
511
  for (const source of allSources) {
557
512
  combinedTxt += `## ${source.title}\n\n`;
@@ -641,11 +596,6 @@ async function main() {
641
596
  fs.mkdirSync(sourceConfig.outDir, { recursive: true });
642
597
  }
643
598
 
644
- // Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
645
- const previousManifest = !sourceConfig.keepOriginalUrls
646
- ? loadManifest(sourceConfig.outDir)
647
- : { files: [], timestamp: new Date().toISOString() };
648
- const currentFiles = [];
649
599
  let sourceFiles = {};
650
600
 
651
601
  try {
@@ -702,7 +652,6 @@ async function main() {
702
652
 
703
653
  fs.mkdirSync(fileDir, { recursive: true });
704
654
  fs.writeFileSync(fullFilePath, file.content);
705
- currentFiles.push(filename);
706
655
 
707
656
  console.log(
708
657
  `šŸ“ Wrote: ${path.join(sourceConfig.outDir, filename)} (${
@@ -710,22 +659,6 @@ async function main() {
710
659
  } tokens)`
711
660
  );
712
661
  }
713
-
714
- // Clean up old files for this source
715
- if (previousManifest.files.length > 0) {
716
- cleanupOldFiles(
717
- sourceConfig.outDir,
718
- currentFiles,
719
- previousManifest.files
720
- );
721
- }
722
-
723
- // Save manifest for this source
724
- const newManifest = {
725
- files: currentFiles,
726
- timestamp: new Date().toISOString(),
727
- };
728
- saveManifest(sourceConfig.outDir, newManifest);
729
662
  } else {
730
663
  console.log(
731
664
  `📋 Keeping original URLs - not saving files locally for ${sourceName}`
@@ -752,7 +685,12 @@ async function main() {
752
685
 
753
686
  // Generate and write combined llms.txt to top-level outDir
754
687
  if (allSources.length > 0) {
755
- const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
688
+ const combinedLlmsTxt = generateCombinedLlmsTxt(
689
+ config.title,
690
+ config.description,
691
+ config.details,
692
+ allSources
693
+ );
756
694
  const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
757
695
  fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
758
696
  console.log(`\n📋 Generated combined llms.txt: ${combinedLlmsTxtPath}`);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.13",
4
+ "version": "0.0.15",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [