extract-from-sitemap 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/cli.js +207 -82
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -8,13 +8,20 @@ const http = require("http");
8
8
  const { URL, URLSearchParams } = require("url");
9
9
  const os = require("os");
10
10
  const { extractFromSitemap } = require("./mod.js");
11
+
12
+ /**
13
+ * @typedef {Object} SourceConfig
14
+ * @property {string} [origin] - The origin URL to process (optional)
15
+ * @property {boolean} forceExtract - Whether to force extraction for this source
16
+ * @property {string} outDir - Output directory for this source's extracted files
17
+ * @property {Array<{title: string, description: string, url: string}>} [customUrls] - Custom URLs to extract for this source
18
+ */
19
+
11
20
  /**
12
21
  * @typedef {Object} Config
13
- * @property {string} outDir - Output directory for extracted files
14
- * @property {string[]} origins - Array of origin URLs to process
15
- * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
22
+ * @property {string} outDir - Top-level output directory for combined llms.txt
23
+ * @property {SourceConfig[]} sources - Array of source configurations
16
24
  * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
17
- * @property {boolean} forceExtract - Whether to force extraction even if files exist
18
25
  */
19
26
 
20
27
  /**
@@ -209,11 +216,32 @@ async function loadConfig() {
209
216
  console.log(
210
217
  JSON.stringify(
211
218
  {
219
+ $schema: "https://extract.llmtext.com/llmtext.schema.json",
212
220
  outDir: "./docs",
213
- origins: ["https://docs.example.com"],
214
- customUrls: [],
221
+ sources: [
222
+ {
223
+ origin: "https://docs.parallel.ai",
224
+ forceExtract: false,
225
+ outDir: "./docs/parallel-docs",
226
+ },
227
+ {
228
+ origin: "https://parallel.ai",
229
+ forceExtract: true,
230
+ outDir: "./docs/parallel-main",
231
+ },
232
+ {
233
+ forceExtract: true,
234
+ outDir: "./docs/custom",
235
+ customUrls: [
236
+ {
237
+ title: "Custom Page",
238
+ description: "A custom page to extract",
239
+ url: "https://example.com/page",
240
+ },
241
+ ],
242
+ },
243
+ ],
215
244
  keepOriginalUrls: false,
216
- forceExtract: false,
217
245
  },
218
246
  null,
219
247
  2
@@ -227,13 +255,38 @@ async function loadConfig() {
227
255
 
228
256
  // Validate required fields
229
257
  if (!config.outDir) throw new Error("outDir is required");
230
- if (!Array.isArray(config.origins))
231
- throw new Error("origins must be an array");
258
+ if (!Array.isArray(config.sources))
259
+ throw new Error("sources must be an array");
260
+
261
+ // Validate source objects
262
+ for (const [index, sourceConfig] of config.sources.entries()) {
263
+ if (typeof sourceConfig !== "object" || sourceConfig === null) {
264
+ throw new Error(`sources[${index}] must be an object`);
265
+ }
266
+ if (!sourceConfig.outDir) {
267
+ throw new Error(`sources[${index}].outDir is required`);
268
+ }
269
+ if (typeof sourceConfig.forceExtract !== "boolean") {
270
+ throw new Error(`sources[${index}].forceExtract must be a boolean`);
271
+ }
272
+ // Either origin or customUrls must be provided
273
+ if (
274
+ !sourceConfig.origin &&
275
+ (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
276
+ ) {
277
+ throw new Error(
278
+ `sources[${index}] must have either origin or customUrls`
279
+ );
280
+ }
281
+ }
232
282
 
233
283
  // Set defaults
234
- config.customUrls = config.customUrls || [];
235
284
  config.keepOriginalUrls = config.keepOriginalUrls ?? false;
236
- config.forceExtract = config.forceExtract ?? false;
285
+
286
+ // Set default customUrls for each source
287
+ for (const sourceConfig of config.sources) {
288
+ sourceConfig.customUrls = sourceConfig.customUrls || [];
289
+ }
237
290
 
238
291
  return config;
239
292
  } catch (error) {
@@ -370,10 +423,9 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
370
423
  * Process custom URLs through extraction API
371
424
  * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
372
425
  * @param {string} apiKey - API key for authentication
373
- * @param {boolean} forceExtract - Whether to force extraction
374
426
  * @returns {Promise<Record<string, any>>} Extracted files
375
427
  */
376
- async function processCustomUrls(customUrls, apiKey, forceExtract) {
428
+ async function processCustomUrls(customUrls, apiKey) {
377
429
  const files = {};
378
430
 
379
431
  for (const customUrl of customUrls) {
@@ -424,6 +476,47 @@ async function processCustomUrls(customUrls, apiKey, forceExtract) {
424
476
  return files;
425
477
  }
426
478
 
479
+ /**
480
+ * Generate combined llms.txt from all sources
481
+ * @param {Array<{sourceName: string, files: Record<string, any>, origin?: string}>} allSources - All processed sources
482
+ * @returns {string} Combined llms.txt content
483
+ */
484
+ function generateCombinedLlmsTxt(allSources) {
485
+ let combinedTxt =
486
+ "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
487
+
488
+ for (const source of allSources) {
489
+ const sourceName = source.origin
490
+ ? new URL(
491
+ source.origin.startsWith("http")
492
+ ? source.origin
493
+ : `https://${source.origin}`
494
+ ).hostname
495
+ : source.sourceName;
496
+
497
+ combinedTxt += `## ${sourceName}\n\n`;
498
+
499
+ // Sort files by path for consistent ordering
500
+ const sortedFiles = Object.entries(source.files)
501
+ .filter(([path]) => path !== "/llms.txt")
502
+ .sort(([a], [b]) => a.localeCompare(b));
503
+
504
+ for (const [path, file] of sortedFiles) {
505
+ if (file.content || file.title) {
506
+ const title = file.title || path.replace(".md", "");
507
+ const description = file.description ? `: ${file.description}` : "";
508
+ combinedTxt += `- [${title}](${path.replace(".md", "")}) (${
509
+ file.tokens
510
+ } tokens)${description}\n`;
511
+ }
512
+ }
513
+
514
+ combinedTxt += "\n";
515
+ }
516
+
517
+ return combinedTxt;
518
+ }
519
+
427
520
  /**
428
521
  * Clear stored API key credentials
429
522
  */
@@ -457,111 +550,143 @@ async function main() {
457
550
  const config = await loadConfig();
458
551
  const apiKey = await getApiKey();
459
552
 
460
- // Ensure output directory exists
553
+ // Ensure top-level output directory exists
461
554
  fs.mkdirSync(config.outDir, { recursive: true });
462
555
 
463
- // Load previous manifest
464
- const previousManifest = loadManifest(config.outDir);
465
- const currentFiles = [];
466
-
556
+ const allSources = [];
467
557
  let totalTokens = 0;
468
558
  let totalPages = 0;
469
559
  let totalErrors = 0;
470
560
 
471
- // Process each origin
472
- for (const origin of config.origins) {
473
- console.log(`\n🌐 Processing origin: ${origin}`);
561
+ // Process each source
562
+ for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
563
+ const sourceName = sourceConfig.origin
564
+ ? `source ${sourceIndex + 1} (${sourceConfig.origin})`
565
+ : `source ${sourceIndex + 1} (custom URLs)`;
566
+
567
+ console.log(
568
+ `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract})`
569
+ );
570
+
571
+ // Ensure source output directory exists
572
+ fs.mkdirSync(sourceConfig.outDir, { recursive: true });
573
+
574
+ // Load previous manifest for this source
575
+ const previousManifest = loadManifest(sourceConfig.outDir);
576
+ const currentFiles = [];
577
+ let sourceFiles = {};
474
578
 
475
579
  try {
476
- const result = await extractFromSitemap(
477
- origin,
478
- config.forceExtract,
479
- apiKey
480
- );
580
+ // Process origin if provided
581
+ if (sourceConfig.origin) {
582
+ const result = await extractFromSitemap(
583
+ sourceConfig.origin,
584
+ sourceConfig.forceExtract,
585
+ apiKey
586
+ );
587
+
588
+ console.log(
589
+ `āœ… Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
590
+ );
591
+ if (result.errors > 0) {
592
+ console.log(`āš ļø ${result.errors} errors occurred`);
593
+ }
481
594
 
482
- console.log(
483
- `āœ… Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
484
- );
485
- if (result.errors > 0) {
486
- console.log(`āš ļø ${result.errors} errors occurred`);
595
+ sourceFiles = result.files;
596
+ totalTokens += result.totalTokens;
597
+ totalPages += result.totalPages;
598
+ totalErrors += result.errors;
487
599
  }
488
600
 
489
- // Write files to disk
490
- for (const [filePath, file] of Object.entries(result.files)) {
601
+ // Process custom URLs for this source
602
+ if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
603
+ console.log(
604
+ `šŸ“‹ Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
605
+ );
606
+ const customFiles = await processCustomUrls(
607
+ sourceConfig.customUrls,
608
+ apiKey
609
+ );
610
+
611
+ // Merge custom files with sitemap files
612
+ sourceFiles = { ...sourceFiles, ...customFiles };
613
+
614
+ for (const file of Object.values(customFiles)) {
615
+ totalTokens += file.tokens;
616
+ totalPages++;
617
+ }
618
+ }
619
+
620
+ // Write files to source directory
621
+ for (const [filePath, file] of Object.entries(sourceFiles)) {
491
622
  let filename = filePath;
492
623
 
493
- if (!config.keepOriginalUrls) {
494
- // Create domain-specific subdirectory
495
- const domain = new URL(
496
- origin.startsWith("http") ? origin : `https://${origin}`
497
- ).hostname;
498
- const domainDir = path.join(config.outDir, domain);
499
- fs.mkdirSync(domainDir, { recursive: true });
500
- filename = path.join(
501
- domain,
502
- filePath.startsWith("/") ? filePath.slice(1) : filePath
503
- );
504
- } else {
624
+ if (!config.keepOriginalUrls && sourceConfig.origin) {
625
+ // Use relative path within source directory
626
+ filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
627
+ } else if (!sourceConfig.origin) {
628
+ // For custom URL sources, use simple filename
505
629
  filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
506
630
  }
507
631
 
508
- const fullFilePath = path.join(config.outDir, filename);
632
+ const fullFilePath = path.join(sourceConfig.outDir, filename);
509
633
  const fileDir = path.dirname(fullFilePath);
510
634
 
511
635
  fs.mkdirSync(fileDir, { recursive: true });
512
636
  fs.writeFileSync(fullFilePath, file.content);
513
637
  currentFiles.push(filename);
514
638
 
515
- console.log(`šŸ“ Wrote: ${fullFilePath} (${file.tokens} tokens)`);
639
+ console.log(
640
+ `šŸ“ Wrote: ${path.join(sourceConfig.outDir, filename)} (${
641
+ file.tokens
642
+ } tokens)`
643
+ );
516
644
  }
517
645
 
518
- totalTokens += result.totalTokens;
519
- totalPages += result.totalPages;
520
- totalErrors += result.errors;
646
+ // Clean up old files for this source
647
+ if (previousManifest.files.length > 0) {
648
+ cleanupOldFiles(
649
+ sourceConfig.outDir,
650
+ currentFiles,
651
+ previousManifest.files
652
+ );
653
+ }
654
+
655
+ // Save manifest for this source
656
+ const newManifest = {
657
+ files: currentFiles,
658
+ timestamp: new Date().toISOString(),
659
+ };
660
+ saveManifest(sourceConfig.outDir, newManifest);
661
+
662
+ // Add to all sources for combined llms.txt
663
+ allSources.push({
664
+ sourceName: `Source ${sourceIndex + 1}`,
665
+ origin: sourceConfig.origin,
666
+ files: sourceFiles,
667
+ });
521
668
  } catch (error) {
522
- console.error(`āŒ Error processing ${origin}:`, error.message);
669
+ console.error(`āŒ Error processing ${sourceName}:`, error.message);
523
670
  totalErrors++;
524
671
  }
525
672
  }
526
673
 
527
- // Process custom URLs
528
- if (config.customUrls.length > 0) {
529
- console.log(`\nšŸ“‹ Processing ${config.customUrls.length} custom URLs...`);
530
- const customFiles = await processCustomUrls(
531
- config.customUrls,
532
- apiKey,
533
- config.forceExtract
534
- );
535
-
536
- for (const [filename, file] of Object.entries(customFiles)) {
537
- const filePath = path.join(config.outDir, filename);
538
- fs.writeFileSync(filePath, file.content);
539
- currentFiles.push(filename);
540
- totalTokens += file.tokens;
541
- totalPages++;
542
-
543
- console.log(`šŸ“ Wrote: ${filePath} (${file.tokens} tokens)`);
544
- }
674
+ // Generate and write combined llms.txt to top-level outDir
675
+ if (allSources.length > 0) {
676
+ const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
677
+ const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
678
+ fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
679
+ console.log(`\nšŸ“‹ Generated combined llms.txt: ${combinedLlmsTxtPath}`);
545
680
  }
546
681
 
547
- // Clean up old files
548
- if (previousManifest.files.length > 0) {
549
- cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
550
- }
551
-
552
- // Save new manifest
553
- const newManifest = {
554
- files: currentFiles,
555
- timestamp: new Date().toISOString(),
556
- };
557
- saveManifest(config.outDir, newManifest);
558
-
559
682
  console.log("\n✨ Extraction completed!");
560
683
  console.log(`šŸ“Š Total: ${totalPages} pages, ${totalTokens} tokens`);
561
684
  if (totalErrors > 0) {
562
685
  console.log(`āš ļø Errors: ${totalErrors}`);
563
686
  }
564
- console.log(`šŸ“ Output directory: ${path.resolve(config.outDir)}`);
687
+ console.log(
688
+ `šŸ“ Top-level output directory: ${path.resolve(config.outDir)}`
689
+ );
565
690
  console.log("\nšŸ’” Use --clear-credentials to remove stored API key");
566
691
  } catch (error) {
567
692
  console.error("šŸ’„ Fatal error:", error.message);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.4",
4
+ "version": "0.0.6",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [