extract-from-sitemap 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/cli.js +244 -106
  2. package/mod.js +6 -51
  3. package/package.json +1 -1
package/cli.js CHANGED
@@ -10,17 +10,19 @@ const os = require("os");
10
10
  const { extractFromSitemap } = require("./mod.js");
11
11
 
12
12
  /**
13
- * @typedef {Object} OriginConfig
14
- * @property {string} origin - The origin URL to process
15
- * @property {boolean} forceExtract - Whether to force extraction for this origin
13
+ * @typedef {Object} SourceConfig
14
+ * @property {string} title - The title for this source
15
+ * @property {string} [origin] - The origin URL to process (optional)
16
+ * @property {string} [outDir] - Output directory for this source's extracted files
17
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
18
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
19
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
16
20
  */
17
21
 
18
22
  /**
19
23
  * @typedef {Object} Config
20
- * @property {string} outDir - Output directory for extracted files
21
- * @property {OriginConfig[]} origins - Array of origin configurations
22
- * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
23
- * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
24
+ * @property {string} outDir - Top-level output directory for combined llms.txt
25
+ * @property {SourceConfig[]} sources - Array of source configurations
24
26
  */
25
27
 
26
28
  /**
@@ -217,12 +219,49 @@ async function loadConfig() {
217
219
  {
218
220
  $schema: "https://extract.llmtext.com/llmtext.schema.json",
219
221
  outDir: "./docs",
220
- origins: [
221
- { origin: "https://docs.parallel.ai", forceExtract: false },
222
- { origin: "https://parallel.ai", forceExtract: true },
222
+ sources: [
223
+ {
224
+ title: "Parallel AI Documentation",
225
+ origin: "https://docs.parallel.ai",
226
+ forceExtract: false,
227
+ outDir: "./docs/parallel-docs",
228
+ keepOriginalUrls: false,
229
+ },
230
+ {
231
+ title: "Parallel AI Website",
232
+ origin: "https://parallel.ai",
233
+ forceExtract: true,
234
+ outDir: "./docs/parallel-main",
235
+ keepOriginalUrls: false,
236
+ },
237
+ {
238
+ title: "Custom Resources",
239
+ forceExtract: true,
240
+ outDir: "./docs/custom",
241
+ keepOriginalUrls: false,
242
+ customUrls: [
243
+ {
244
+ title: "Custom Page",
245
+ description: "A custom page to extract",
246
+ filename: "custom-page",
247
+ url: "https://example.com/page",
248
+ },
249
+ ],
250
+ },
251
+ {
252
+ title: "External References",
253
+ keepOriginalUrls: true,
254
+ forceExtract: false,
255
+ customUrls: [
256
+ {
257
+ title: "External API Guide",
258
+ description: "Third-party API documentation",
259
+ filename: "external-api",
260
+ url: "https://external.com/api-guide",
261
+ },
262
+ ],
263
+ },
223
264
  ],
224
- customUrls: [],
225
- keepOriginalUrls: false,
226
265
  },
227
266
  null,
228
267
  2
@@ -236,25 +275,56 @@ async function loadConfig() {
236
275
 
237
276
  // Validate required fields
238
277
  if (!config.outDir) throw new Error("outDir is required");
239
- if (!Array.isArray(config.origins))
240
- throw new Error("origins must be an array");
278
+ if (!Array.isArray(config.sources))
279
+ throw new Error("sources must be an array");
241
280
 
242
- // Validate origin objects
243
- for (const [index, originConfig] of config.origins.entries()) {
244
- if (typeof originConfig !== "object" || originConfig === null) {
245
- throw new Error(`origins[${index}] must be an object`);
281
+ // Validate source objects
282
+ for (const [index, sourceConfig] of config.sources.entries()) {
283
+ if (typeof sourceConfig !== "object" || sourceConfig === null) {
284
+ throw new Error(`sources[${index}] must be an object`);
246
285
  }
247
- if (!originConfig.origin) {
248
- throw new Error(`origins[${index}].origin is required`);
286
+ if (!sourceConfig.title) {
287
+ throw new Error(`sources[${index}].title is required`);
249
288
  }
250
- if (typeof originConfig.forceExtract !== "boolean") {
251
- throw new Error(`origins[${index}].forceExtract must be a boolean`);
289
+
290
+ // Set defaults
291
+ sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
292
+ sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
293
+ sourceConfig.customUrls = sourceConfig.customUrls || [];
294
+
295
+ // Either origin or customUrls must be provided
296
+ if (
297
+ !sourceConfig.origin &&
298
+ (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
299
+ ) {
300
+ throw new Error(
301
+ `sources[${index}] must have either origin or customUrls`
302
+ );
252
303
  }
253
- }
254
304
 
255
- // Set defaults
256
- config.customUrls = config.customUrls || [];
257
- config.keepOriginalUrls = config.keepOriginalUrls ?? false;
305
+ // outDir is required unless keepOriginalUrls is true
306
+ if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
307
+ throw new Error(
308
+ `sources[${index}].outDir is required when keepOriginalUrls is false`
309
+ );
310
+ }
311
+
312
+ // Validate customUrls
313
+ for (const [urlIndex, customUrl] of (
314
+ sourceConfig.customUrls || []
315
+ ).entries()) {
316
+ if (
317
+ !customUrl.title ||
318
+ !customUrl.description ||
319
+ !customUrl.filename ||
320
+ !customUrl.url
321
+ ) {
322
+ throw new Error(
323
+ `sources[${index}].customUrls[${urlIndex}] must have title, description, filename, and url`
324
+ );
325
+ }
326
+ }
327
+ }
258
328
 
259
329
  return config;
260
330
  } catch (error) {
@@ -389,7 +459,7 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
389
459
 
390
460
  /**
391
461
  * Process custom URLs through extraction API
392
- * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
462
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
393
463
  * @param {string} apiKey - API key for authentication
394
464
  * @returns {Promise<Record<string, any>>} Extracted files
395
465
  */
@@ -417,8 +487,7 @@ async function processCustomUrls(customUrls, apiKey) {
417
487
  const result = await response.json();
418
488
  if (result.results && result.results.length > 0) {
419
489
  const extracted = result.results[0];
420
- const filename =
421
- customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
490
+ const filename = customUrl.filename + ".md";
422
491
 
423
492
  files[filename] = {
424
493
  content: extracted.full_content || "",
@@ -428,6 +497,7 @@ async function processCustomUrls(customUrls, apiKey) {
428
497
  publishedDate: extracted.published_date || "",
429
498
  status: 200,
430
499
  tokens: Math.round((extracted.full_content || "").length / 5),
500
+ originalUrl: customUrl.url,
431
501
  };
432
502
  }
433
503
  } else {
@@ -444,6 +514,43 @@ async function processCustomUrls(customUrls, apiKey) {
444
514
  return files;
445
515
  }
446
516
 
517
+ /**
518
+ * Generate combined llms.txt from all sources
519
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
520
+ * @returns {string} Combined llms.txt content
521
+ */
522
+ function generateCombinedLlmsTxt(allSources) {
523
+ let combinedTxt =
524
+ "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
525
+
526
+ for (const source of allSources) {
527
+ combinedTxt += `## ${source.title}\n\n`;
528
+
529
+ // Sort files by path for consistent ordering
530
+ const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
531
+ a.localeCompare(b)
532
+ );
533
+
534
+ for (const [path, file] of sortedFiles) {
535
+ if (file.content || file.title) {
536
+ const title = file.title || path.replace(".md", "");
537
+ const description = file.description ? `: ${file.description}` : "";
538
+
539
+ // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
540
+ const link = source.keepOriginalUrls
541
+ ? file.originalUrl
542
+ : path.replace(".md", "");
543
+
544
+ combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
545
+ }
546
+ }
547
+
548
+ combinedTxt += "\n";
549
+ }
550
+
551
+ return combinedTxt;
552
+ }
553
+
447
554
  /**
448
555
  * Clear stored API key credentials
449
556
  */
@@ -477,114 +584,145 @@ async function main() {
477
584
  const config = await loadConfig();
478
585
  const apiKey = await getApiKey();
479
586
 
480
- // Ensure output directory exists
587
+ // Ensure top-level output directory exists
481
588
  fs.mkdirSync(config.outDir, { recursive: true });
482
589
 
483
- // Load previous manifest
484
- const previousManifest = loadManifest(config.outDir);
485
- const currentFiles = [];
486
-
590
+ const allSources = [];
487
591
  let totalTokens = 0;
488
592
  let totalPages = 0;
489
593
  let totalErrors = 0;
490
594
 
491
- // Process each origin with its own forceExtract setting
492
- for (const originConfig of config.origins) {
595
+ // Process each source
596
+ for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
597
+ const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;
598
+
493
599
  console.log(
494
- `\n🌐 Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
600
+ `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
495
601
  );
496
602
 
603
+ // Only ensure source output directory exists if not keeping original URLs
604
+ if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
605
+ fs.mkdirSync(sourceConfig.outDir, { recursive: true });
606
+ }
607
+
608
+ // Load previous manifest for this source (only if we have an outDir)
609
+ const previousManifest = sourceConfig.outDir
610
+ ? loadManifest(sourceConfig.outDir)
611
+ : { files: [], timestamp: new Date().toISOString() };
612
+ const currentFiles = [];
613
+ let sourceFiles = {};
614
+
497
615
  try {
498
- const result = await extractFromSitemap(
499
- originConfig.origin,
500
- originConfig.forceExtract,
501
- apiKey
502
- );
616
+ // Process origin if provided
617
+ if (sourceConfig.origin) {
618
+ const result = await extractFromSitemap(
619
+ sourceConfig.origin,
620
+ sourceConfig.forceExtract,
621
+ apiKey
622
+ );
623
+
624
+ console.log(
625
+ `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
626
+ );
627
+ if (result.errors > 0) {
628
+ console.log(`⚠️ ${result.errors} errors occurred`);
629
+ }
503
630
 
504
- console.log(
505
- `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
506
- );
507
- if (result.errors > 0) {
508
- console.log(`⚠️ ${result.errors} errors occurred`);
631
+ sourceFiles = result.files;
632
+ totalTokens += result.totalTokens;
633
+ totalPages += result.totalPages;
634
+ totalErrors += result.errors;
509
635
  }
510
636
 
511
- // Write files to disk
512
- for (const [filePath, file] of Object.entries(result.files)) {
513
- let filename = filePath;
514
-
515
- if (!config.keepOriginalUrls) {
516
- // Create domain-specific subdirectory
517
- const domain = new URL(
518
- originConfig.origin.startsWith("http")
519
- ? originConfig.origin
520
- : `https://${originConfig.origin}`
521
- ).hostname;
522
- const domainDir = path.join(config.outDir, domain);
523
- fs.mkdirSync(domainDir, { recursive: true });
524
- filename = path.join(
525
- domain,
526
- filePath.startsWith("/") ? filePath.slice(1) : filePath
527
- );
528
- } else {
529
- filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
637
+ // Process custom URLs for this source
638
+ if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
639
+ console.log(
640
+ `📋 Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
641
+ );
642
+ const customFiles = await processCustomUrls(
643
+ sourceConfig.customUrls,
644
+ apiKey
645
+ );
646
+
647
+ // Merge custom files with sitemap files
648
+ sourceFiles = { ...sourceFiles, ...customFiles };
649
+
650
+ for (const file of Object.values(customFiles)) {
651
+ totalTokens += file.tokens;
652
+ totalPages++;
530
653
  }
654
+ }
531
655
 
532
- const fullFilePath = path.join(config.outDir, filename);
533
- const fileDir = path.dirname(fullFilePath);
656
+ // Write files to source directory (only if not keeping original URLs)
657
+ if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
658
+ for (const [filePath, file] of Object.entries(sourceFiles)) {
659
+ let filename = filePath.startsWith("/")
660
+ ? filePath.slice(1)
661
+ : filePath;
534
662
 
535
- fs.mkdirSync(fileDir, { recursive: true });
536
- fs.writeFileSync(fullFilePath, file.content);
537
- currentFiles.push(filename);
663
+ const fullFilePath = path.join(sourceConfig.outDir, filename);
664
+ const fileDir = path.dirname(fullFilePath);
538
665
 
539
- console.log(`📝 Wrote: ${filename} (${file.tokens} tokens)`);
540
- }
666
+ fs.mkdirSync(fileDir, { recursive: true });
667
+ fs.writeFileSync(fullFilePath, file.content);
668
+ currentFiles.push(filename);
541
669
 
542
- totalTokens += result.totalTokens;
543
- totalPages += result.totalPages;
544
- totalErrors += result.errors;
545
- } catch (error) {
546
- console.error(
547
- `āŒ Error processing ${originConfig.origin}:`,
548
- error.message
549
- );
550
- totalErrors++;
551
- }
552
- }
670
+ console.log(
671
+ `šŸ“ Wrote: ${path.join(sourceConfig.outDir, filename)} (${
672
+ file.tokens
673
+ } tokens)`
674
+ );
675
+ }
553
676
 
554
- // Process custom URLs
555
- if (config.customUrls.length > 0) {
556
- console.log(`\n📋 Processing ${config.customUrls.length} custom URLs...`);
557
- const customFiles = await processCustomUrls(config.customUrls, apiKey);
677
+ // Clean up old files for this source
678
+ if (previousManifest.files.length > 0) {
679
+ cleanupOldFiles(
680
+ sourceConfig.outDir,
681
+ currentFiles,
682
+ previousManifest.files
683
+ );
684
+ }
558
685
 
559
- for (const [filename, file] of Object.entries(customFiles)) {
560
- const filePath = path.join(config.outDir, filename);
561
- fs.writeFileSync(filePath, file.content);
562
- currentFiles.push(filename);
563
- totalTokens += file.tokens;
564
- totalPages++;
686
+ // Save manifest for this source
687
+ const newManifest = {
688
+ files: currentFiles,
689
+ timestamp: new Date().toISOString(),
690
+ };
691
+ saveManifest(sourceConfig.outDir, newManifest);
692
+ } else {
693
+ console.log(
694
+ `📋 Keeping original URLs - not saving files locally for ${sourceName}`
695
+ );
696
+ }
565
697
 
566
- console.log(`📝 Wrote: ${filename} (${file.tokens} tokens)`);
698
+ // Add to all sources for combined llms.txt
699
+ allSources.push({
700
+ title: sourceConfig.title,
701
+ files: sourceFiles,
702
+ keepOriginalUrls: sourceConfig.keepOriginalUrls,
703
+ });
704
+ } catch (error) {
705
+ console.error(`❌ Error processing ${sourceName}:`, error.message);
706
+ totalErrors++;
567
707
  }
568
708
  }
569
709
 
570
- // Clean up old files
571
- if (previousManifest.files.length > 0) {
572
- cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
710
+ // Generate and write combined llms.txt to top-level outDir
711
+ if (allSources.length > 0) {
712
+ const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
713
+ const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
714
+ fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
715
+ console.log(`\n📋 Generated combined llms.txt: ${combinedLlmsTxtPath}`);
573
716
  }
574
717
 
575
- // Save new manifest
576
- const newManifest = {
577
- files: currentFiles,
578
- timestamp: new Date().toISOString(),
579
- };
580
- saveManifest(config.outDir, newManifest);
581
-
582
718
  console.log("\n✨ Extraction completed!");
583
719
  console.log(`📊 Total: ${totalPages} pages, ${totalTokens} tokens`);
584
720
  if (totalErrors > 0) {
585
721
  console.log(`⚠️ Errors: ${totalErrors}`);
586
722
  }
587
- console.log(`📁 Output directory: ${path.resolve(config.outDir)}`);
723
+ console.log(
724
+ `šŸ“ Top-level output directory: ${path.resolve(config.outDir)}`
725
+ );
588
726
  console.log("\n💡 Use --clear-credentials to remove stored API key");
589
727
  } catch (error) {
590
728
  console.error("💥 Fatal error:", error.message);
package/mod.js CHANGED
@@ -8,6 +8,7 @@
8
8
  * @property {boolean} extracted - Whether the content was extracted or directly fetched
9
9
  * @property {number} status - HTTP status code or processing status
10
10
  * @property {number} tokens - Number of tokens in the content
11
+ * @property {string} originalUrl - The original URL of the content
11
12
  */
12
13
 
13
14
  /**
@@ -62,6 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
62
63
  status: result.status,
63
64
  tokens: Math.round(result.content.length / 5),
64
65
  publishedDate: result.publishedDate || "",
66
+ originalUrl: urlStr,
65
67
  error: result.error,
66
68
  };
67
69
 
@@ -80,6 +82,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
80
82
  status: 0,
81
83
  tokens: 0,
82
84
  publishedDate: "",
85
+ originalUrl: urlStr,
83
86
  };
84
87
  if (!forceExtract) {
85
88
  urlsNeedingExtract.push(urlStr);
@@ -108,6 +111,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
108
111
  status: 0,
109
112
  tokens: 0,
110
113
  publishedDate: "",
114
+ originalUrl: result.url,
111
115
  };
112
116
 
113
117
  const content = result.full_content || existing.content;
@@ -122,6 +126,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
122
126
  publishedDate: result.published_date || existing.publishedDate,
123
127
  status: existing.status,
124
128
  tokens: Math.round(content.length / 5),
129
+ originalUrl: existing.originalUrl,
125
130
  };
126
131
  }
127
132
 
@@ -137,18 +142,6 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
137
142
  }
138
143
  }
139
144
 
140
- // Generate llms.txt
141
- const llmsTxt = generateLlmsTxt(origin, files);
142
- files["/llms.txt"] = {
143
- content: llmsTxt,
144
- title: "LLMs.txt",
145
- description: "LLM-friendly content listing",
146
- extracted: false,
147
- publishedDate: "",
148
- status: 200,
149
- tokens: Math.round(llmsTxt.length / 5),
150
- };
151
-
152
145
  // Sort files by path
153
146
  const sortedFiles = Object.keys(files)
154
147
  .sort()
@@ -162,7 +155,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
162
155
  (sum, file) => sum + file.tokens,
163
156
  0
164
157
  );
165
- const totalPages = Object.keys(sortedFiles).length - 1; // Exclude llms.txt from page count
158
+ const totalPages = Object.keys(sortedFiles).length;
166
159
  const errors = Object.values(sortedFiles).filter((file) => file.error).length;
167
160
  const processingTimeMs = Date.now() - startTime;
168
161
 
@@ -527,44 +520,6 @@ function getPathFromUrl(urlStr) {
527
520
  }
528
521
  }
529
522
 
530
- /**
531
- * Generate llms.txt content
532
- * @param {string} origin - Site origin
533
- * @param {Record<string, any>} files - Files object
534
- * @returns {string} Generated llms.txt content
535
- */
536
- function generateLlmsTxt(origin, files) {
537
- // Find homepage for top-level description
538
- const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
539
- const siteTitle =
540
- homepageFile?.title ||
541
- new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
542
- const siteDescription =
543
- homepageFile?.description || `Documentation for ${siteTitle}`;
544
-
545
- let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
546
-
547
- // Add documentation section
548
- llmsTxt += "## Documentation\n\n";
549
-
550
- // Sort files by path for consistent ordering
551
- const sortedFiles = Object.entries(files)
552
- .filter(([path]) => path !== "/llms.txt")
553
- .sort(([a], [b]) => a.localeCompare(b));
554
-
555
- for (const [path, file] of sortedFiles) {
556
- if (file.content || file.title) {
557
- const title = file.title || path.replace(".md", "");
558
- const description = file.description ? `: ${file.description}` : "";
559
- llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
560
- file.tokens
561
- } tokens)${description}\n`;
562
- }
563
- }
564
-
565
- return llmsTxt;
566
- }
567
-
568
523
  /**
569
524
  * Call Parallel Extract API for multiple URLs
570
525
  * @param {string[]} urls - URLs to extract
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.5",
4
+ "version": "0.0.7",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [