extract-from-sitemap 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/cli.js +193 -91
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -10,16 +10,17 @@ const os = require("os");
10
10
  const { extractFromSitemap } = require("./mod.js");
11
11
 
12
12
  /**
13
- * @typedef {Object} OriginConfig
14
- * @property {string} origin - The origin URL to process
15
- * @property {boolean} forceExtract - Whether to force extraction for this origin
13
+ * @typedef {Object} SourceConfig
14
+ * @property {string} [origin] - The origin URL to process (optional)
15
+ * @property {boolean} forceExtract - Whether to force extraction for this source
16
+ * @property {string} outDir - Output directory for this source's extracted files
17
+ * @property {Array<{title: string, description: string, url: string}>} [customUrls] - Custom URLs to extract for this source
16
18
  */
17
19
 
18
20
  /**
19
21
  * @typedef {Object} Config
20
- * @property {string} outDir - Output directory for extracted files
21
- * @property {OriginConfig[]} origins - Array of origin configurations
22
- * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
22
+ * @property {string} outDir - Top-level output directory for combined llms.txt
23
+ * @property {SourceConfig[]} sources - Array of source configurations
23
24
  * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
24
25
  */
25
26
 
@@ -217,11 +218,29 @@ async function loadConfig() {
217
218
  {
218
219
  $schema: "https://extract.llmtext.com/llmtext.schema.json",
219
220
  outDir: "./docs",
220
- origins: [
221
- { origin: "https://docs.parallel.ai", forceExtract: false },
222
- { origin: "https://parallel.ai", forceExtract: true },
221
+ sources: [
222
+ {
223
+ origin: "https://docs.parallel.ai",
224
+ forceExtract: false,
225
+ outDir: "./docs/parallel-docs",
226
+ },
227
+ {
228
+ origin: "https://parallel.ai",
229
+ forceExtract: true,
230
+ outDir: "./docs/parallel-main",
231
+ },
232
+ {
233
+ forceExtract: true,
234
+ outDir: "./docs/custom",
235
+ customUrls: [
236
+ {
237
+ title: "Custom Page",
238
+ description: "A custom page to extract",
239
+ url: "https://example.com/page",
240
+ },
241
+ ],
242
+ },
223
243
  ],
224
- customUrls: [],
225
244
  keepOriginalUrls: false,
226
245
  },
227
246
  null,
@@ -236,26 +255,39 @@ async function loadConfig() {
236
255
 
237
256
  // Validate required fields
238
257
  if (!config.outDir) throw new Error("outDir is required");
239
- if (!Array.isArray(config.origins))
240
- throw new Error("origins must be an array");
258
+ if (!Array.isArray(config.sources))
259
+ throw new Error("sources must be an array");
241
260
 
242
- // Validate origin objects
243
- for (const [index, originConfig] of config.origins.entries()) {
244
- if (typeof originConfig !== "object" || originConfig === null) {
245
- throw new Error(`origins[${index}] must be an object`);
261
+ // Validate source objects
262
+ for (const [index, sourceConfig] of config.sources.entries()) {
263
+ if (typeof sourceConfig !== "object" || sourceConfig === null) {
264
+ throw new Error(`sources[${index}] must be an object`);
246
265
  }
247
- if (!originConfig.origin) {
248
- throw new Error(`origins[${index}].origin is required`);
266
+ if (!sourceConfig.outDir) {
267
+ throw new Error(`sources[${index}].outDir is required`);
249
268
  }
250
- if (typeof originConfig.forceExtract !== "boolean") {
251
- throw new Error(`origins[${index}].forceExtract must be a boolean`);
269
+ if (typeof sourceConfig.forceExtract !== "boolean") {
270
+ throw new Error(`sources[${index}].forceExtract must be a boolean`);
271
+ }
272
+ // Either origin or customUrls must be provided
273
+ if (
274
+ !sourceConfig.origin &&
275
+ (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
276
+ ) {
277
+ throw new Error(
278
+ `sources[${index}] must have either origin or customUrls`
279
+ );
252
280
  }
253
281
  }
254
282
 
255
283
  // Set defaults
256
- config.customUrls = config.customUrls || [];
257
284
  config.keepOriginalUrls = config.keepOriginalUrls ?? false;
258
285
 
286
+ // Set default customUrls for each source
287
+ for (const sourceConfig of config.sources) {
288
+ sourceConfig.customUrls = sourceConfig.customUrls || [];
289
+ }
290
+
259
291
  return config;
260
292
  } catch (error) {
261
293
  console.error("❌ Error reading llmtext.json:", error.message);
@@ -444,6 +476,47 @@ async function processCustomUrls(customUrls, apiKey) {
444
476
  return files;
445
477
  }
446
478
 
479
+ /**
480
+ * Generate combined llms.txt from all sources
481
+ * @param {Array<{sourceName: string, files: Record<string, any>, origin?: string}>} allSources - All processed sources
482
+ * @returns {string} Combined llms.txt content
483
+ */
484
+ function generateCombinedLlmsTxt(allSources) {
485
+ let combinedTxt =
486
+ "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
487
+
488
+ for (const source of allSources) {
489
+ const sourceName = source.origin
490
+ ? new URL(
491
+ source.origin.startsWith("http")
492
+ ? source.origin
493
+ : `https://${source.origin}`
494
+ ).hostname
495
+ : source.sourceName;
496
+
497
+ combinedTxt += `## ${sourceName}\n\n`;
498
+
499
+ // Sort files by path for consistent ordering
500
+ const sortedFiles = Object.entries(source.files)
501
+ .filter(([path]) => path !== "/llms.txt")
502
+ .sort(([a], [b]) => a.localeCompare(b));
503
+
504
+ for (const [path, file] of sortedFiles) {
505
+ if (file.content || file.title) {
506
+ const title = file.title || path.replace(".md", "");
507
+ const description = file.description ? `: ${file.description}` : "";
508
+ combinedTxt += `- [${title}](${path.replace(".md", "")}) (${
509
+ file.tokens
510
+ } tokens)${description}\n`;
511
+ }
512
+ }
513
+
514
+ combinedTxt += "\n";
515
+ }
516
+
517
+ return combinedTxt;
518
+ }
519
+
447
520
  /**
448
521
  * Clear stored API key credentials
449
522
  */
@@ -477,114 +550,143 @@ async function main() {
477
550
  const config = await loadConfig();
478
551
  const apiKey = await getApiKey();
479
552
 
480
- // Ensure output directory exists
553
+ // Ensure top-level output directory exists
481
554
  fs.mkdirSync(config.outDir, { recursive: true });
482
555
 
483
- // Load previous manifest
484
- const previousManifest = loadManifest(config.outDir);
485
- const currentFiles = [];
486
-
556
+ const allSources = [];
487
557
  let totalTokens = 0;
488
558
  let totalPages = 0;
489
559
  let totalErrors = 0;
490
560
 
491
- // Process each origin with its own forceExtract setting
492
- for (const originConfig of config.origins) {
561
+ // Process each source
562
+ for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
563
+ const sourceName = sourceConfig.origin
564
+ ? `source ${sourceIndex + 1} (${sourceConfig.origin})`
565
+ : `source ${sourceIndex + 1} (custom URLs)`;
566
+
493
567
  console.log(
494
- `\n🌐 Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
568
+ `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract})`
495
569
  );
496
570
 
571
+ // Ensure source output directory exists
572
+ fs.mkdirSync(sourceConfig.outDir, { recursive: true });
573
+
574
+ // Load previous manifest for this source
575
+ const previousManifest = loadManifest(sourceConfig.outDir);
576
+ const currentFiles = [];
577
+ let sourceFiles = {};
578
+
497
579
  try {
498
- const result = await extractFromSitemap(
499
- originConfig.origin,
500
- originConfig.forceExtract,
501
- apiKey
502
- );
580
+ // Process origin if provided
581
+ if (sourceConfig.origin) {
582
+ const result = await extractFromSitemap(
583
+ sourceConfig.origin,
584
+ sourceConfig.forceExtract,
585
+ apiKey
586
+ );
587
+
588
+ console.log(
589
+ `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
590
+ );
591
+ if (result.errors > 0) {
592
+ console.log(`⚠️ ${result.errors} errors occurred`);
593
+ }
503
594
 
504
- console.log(
505
- `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
506
- );
507
- if (result.errors > 0) {
508
- console.log(`⚠️ ${result.errors} errors occurred`);
595
+ sourceFiles = result.files;
596
+ totalTokens += result.totalTokens;
597
+ totalPages += result.totalPages;
598
+ totalErrors += result.errors;
509
599
  }
510
600
 
511
- // Write files to disk
512
- for (const [filePath, file] of Object.entries(result.files)) {
601
+ // Process custom URLs for this source
602
+ if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
603
+ console.log(
604
+ `📋 Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
605
+ );
606
+ const customFiles = await processCustomUrls(
607
+ sourceConfig.customUrls,
608
+ apiKey
609
+ );
610
+
611
+ // Merge custom files with sitemap files
612
+ sourceFiles = { ...sourceFiles, ...customFiles };
613
+
614
+ for (const file of Object.values(customFiles)) {
615
+ totalTokens += file.tokens;
616
+ totalPages++;
617
+ }
618
+ }
619
+
620
+ // Write files to source directory
621
+ for (const [filePath, file] of Object.entries(sourceFiles)) {
513
622
  let filename = filePath;
514
623
 
515
- if (!config.keepOriginalUrls) {
516
- // Create domain-specific subdirectory
517
- const domain = new URL(
518
- originConfig.origin.startsWith("http")
519
- ? originConfig.origin
520
- : `https://${originConfig.origin}`
521
- ).hostname;
522
- const domainDir = path.join(config.outDir, domain);
523
- fs.mkdirSync(domainDir, { recursive: true });
524
- filename = path.join(
525
- domain,
526
- filePath.startsWith("/") ? filePath.slice(1) : filePath
527
- );
528
- } else {
624
+ if (!config.keepOriginalUrls && sourceConfig.origin) {
625
+ // Use relative path within source directory
626
+ filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
627
+ } else if (!sourceConfig.origin) {
628
+ // For custom URL sources, use simple filename
529
629
  filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
530
630
  }
531
631
 
532
- const fullFilePath = path.join(config.outDir, filename);
632
+ const fullFilePath = path.join(sourceConfig.outDir, filename);
533
633
  const fileDir = path.dirname(fullFilePath);
534
634
 
535
635
  fs.mkdirSync(fileDir, { recursive: true });
536
636
  fs.writeFileSync(fullFilePath, file.content);
537
637
  currentFiles.push(filename);
538
638
 
539
- console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
639
+ console.log(
640
+ `šŸ“ Wrote: ${path.join(sourceConfig.outDir, filename)} (${
641
+ file.tokens
642
+ } tokens)`
643
+ );
644
+ }
645
+
646
+ // Clean up old files for this source
647
+ if (previousManifest.files.length > 0) {
648
+ cleanupOldFiles(
649
+ sourceConfig.outDir,
650
+ currentFiles,
651
+ previousManifest.files
652
+ );
540
653
  }
541
654
 
542
- totalTokens += result.totalTokens;
543
- totalPages += result.totalPages;
544
- totalErrors += result.errors;
655
+ // Save manifest for this source
656
+ const newManifest = {
657
+ files: currentFiles,
658
+ timestamp: new Date().toISOString(),
659
+ };
660
+ saveManifest(sourceConfig.outDir, newManifest);
661
+
662
+ // Add to all sources for combined llms.txt
663
+ allSources.push({
664
+ sourceName: `Source ${sourceIndex + 1}`,
665
+ origin: sourceConfig.origin,
666
+ files: sourceFiles,
667
+ });
545
668
  } catch (error) {
546
- console.error(
547
- `āŒ Error processing ${originConfig.origin}:`,
548
- error.message
549
- );
669
+ console.error(`āŒ Error processing ${sourceName}:`, error.message);
550
670
  totalErrors++;
551
671
  }
552
672
  }
553
673
 
554
- // Process custom URLs
555
- if (config.customUrls.length > 0) {
556
- console.log(`\n📋 Processing ${config.customUrls.length} custom URLs...`);
557
- const customFiles = await processCustomUrls(config.customUrls, apiKey);
558
-
559
- for (const [filename, file] of Object.entries(customFiles)) {
560
- const filePath = path.join(config.outDir, filename);
561
- fs.writeFileSync(filePath, file.content);
562
- currentFiles.push(filename);
563
- totalTokens += file.tokens;
564
- totalPages++;
565
-
566
- console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
567
- }
568
- }
569
-
570
- // Clean up old files
571
- if (previousManifest.files.length > 0) {
572
- cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
674
+ // Generate and write combined llms.txt to top-level outDir
675
+ if (allSources.length > 0) {
676
+ const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
677
+ const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
678
+ fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
679
+ console.log(`\n📋 Generated combined llms.txt: ${combinedLlmsTxtPath}`);
573
680
  }
574
681
 
575
- // Save new manifest
576
- const newManifest = {
577
- files: currentFiles,
578
- timestamp: new Date().toISOString(),
579
- };
580
- saveManifest(config.outDir, newManifest);
581
-
582
682
  console.log("\n✨ Extraction completed!");
583
683
  console.log(`📊 Total: ${totalPages} pages, ${totalTokens} tokens`);
584
684
  if (totalErrors > 0) {
585
685
  console.log(`⚠️ Errors: ${totalErrors}`);
586
686
  }
587
- console.log(`šŸ“ Output directory: ${path.resolve(config.outDir)}`);
687
+ console.log(
688
+ `šŸ“ Top-level output directory: ${path.resolve(config.outDir)}`
689
+ );
588
690
  console.log("\n💡 Use --clear-credentials to remove stored API key");
589
691
  } catch (error) {
590
692
  console.error("💥 Fatal error:", error.message);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.5",
4
+ "version": "0.0.6",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [