extract-from-sitemap 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/cli.js +113 -79
  2. package/mod.js +6 -51
  3. package/package.json +1 -1
package/cli.js CHANGED
@@ -11,17 +11,18 @@ const { extractFromSitemap } = require("./mod.js");
11
11
 
12
12
  /**
13
13
  * @typedef {Object} SourceConfig
14
+ * @property {string} title - The title for this source
14
15
  * @property {string} [origin] - The origin URL to process (optional)
15
- * @property {boolean} forceExtract - Whether to force extraction for this source
16
- * @property {string} outDir - Output directory for this source's extracted files
17
- * @property {Array<{title: string, description: string, url: string}>} [customUrls] - Custom URLs to extract for this source
16
+ * @property {string} [outDir] - Output directory for this source's extracted files
17
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
18
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
19
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
18
20
  */
19
21
 
20
22
  /**
21
23
  * @typedef {Object} Config
22
24
  * @property {string} outDir - Top-level output directory for combined llms.txt
23
25
  * @property {SourceConfig[]} sources - Array of source configurations
24
- * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
25
26
  */
26
27
 
27
28
  /**
@@ -220,28 +221,47 @@ async function loadConfig() {
220
221
  outDir: "./docs",
221
222
  sources: [
222
223
  {
224
+ title: "Parallel AI Documentation",
223
225
  origin: "https://docs.parallel.ai",
224
226
  forceExtract: false,
225
227
  outDir: "./docs/parallel-docs",
228
+ keepOriginalUrls: false,
226
229
  },
227
230
  {
231
+ title: "Parallel AI Website",
228
232
  origin: "https://parallel.ai",
229
233
  forceExtract: true,
230
234
  outDir: "./docs/parallel-main",
235
+ keepOriginalUrls: false,
231
236
  },
232
237
  {
238
+ title: "Custom Resources",
233
239
  forceExtract: true,
234
240
  outDir: "./docs/custom",
241
+ keepOriginalUrls: false,
235
242
  customUrls: [
236
243
  {
237
244
  title: "Custom Page",
238
245
  description: "A custom page to extract",
246
+ filename: "custom-page",
239
247
  url: "https://example.com/page",
240
248
  },
241
249
  ],
242
250
  },
251
+ {
252
+ title: "External References",
253
+ keepOriginalUrls: true,
254
+ forceExtract: false,
255
+ customUrls: [
256
+ {
257
+ title: "External API Guide",
258
+ description: "Third-party API documentation",
259
+ filename: "external-api",
260
+ url: "https://external.com/api-guide",
261
+ },
262
+ ],
263
+ },
243
264
  ],
244
- keepOriginalUrls: false,
245
265
  },
246
266
  null,
247
267
  2
@@ -263,12 +283,15 @@ async function loadConfig() {
263
283
  if (typeof sourceConfig !== "object" || sourceConfig === null) {
264
284
  throw new Error(`sources[${index}] must be an object`);
265
285
  }
266
- if (!sourceConfig.outDir) {
267
- throw new Error(`sources[${index}].outDir is required`);
268
- }
269
- if (typeof sourceConfig.forceExtract !== "boolean") {
270
- throw new Error(`sources[${index}].forceExtract must be a boolean`);
286
+ if (!sourceConfig.title) {
287
+ throw new Error(`sources[${index}].title is required`);
271
288
  }
289
+
290
+ // Set defaults
291
+ sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
292
+ sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
293
+ sourceConfig.customUrls = sourceConfig.customUrls || [];
294
+
272
295
  // Either origin or customUrls must be provided
273
296
  if (
274
297
  !sourceConfig.origin &&
@@ -278,14 +301,29 @@ async function loadConfig() {
278
301
  `sources[${index}] must have either origin or customUrls`
279
302
  );
280
303
  }
281
- }
282
304
 
283
- // Set defaults
284
- config.keepOriginalUrls = config.keepOriginalUrls ?? false;
305
+ // outDir is required unless keepOriginalUrls is true
306
+ if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
307
+ throw new Error(
308
+ `sources[${index}].outDir is required when keepOriginalUrls is false`
309
+ );
310
+ }
285
311
 
286
- // Set default customUrls for each source
287
- for (const sourceConfig of config.sources) {
288
- sourceConfig.customUrls = sourceConfig.customUrls || [];
312
+ // Validate customUrls
313
+ for (const [urlIndex, customUrl] of (
314
+ sourceConfig.customUrls || []
315
+ ).entries()) {
316
+ if (
317
+ !customUrl.title ||
318
+ !customUrl.description ||
319
+ !customUrl.filename ||
320
+ !customUrl.url
321
+ ) {
322
+ throw new Error(
323
+ `sources[${index}].customUrls[${urlIndex}] must have title, description, filename, and url`
324
+ );
325
+ }
326
+ }
289
327
  }
290
328
 
291
329
  return config;
@@ -421,7 +459,7 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
421
459
 
422
460
  /**
423
461
  * Process custom URLs through extraction API
424
- * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
462
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
425
463
  * @param {string} apiKey - API key for authentication
426
464
  * @returns {Promise<Record<string, any>>} Extracted files
427
465
  */
@@ -449,8 +487,7 @@ async function processCustomUrls(customUrls, apiKey) {
449
487
  const result = await response.json();
450
488
  if (result.results && result.results.length > 0) {
451
489
  const extracted = result.results[0];
452
- const filename =
453
- customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
490
+ const filename = customUrl.filename + ".md";
454
491
 
455
492
  files[filename] = {
456
493
  content: extracted.full_content || "",
@@ -460,6 +497,7 @@ async function processCustomUrls(customUrls, apiKey) {
460
497
  publishedDate: extracted.published_date || "",
461
498
  status: 200,
462
499
  tokens: Math.round((extracted.full_content || "").length / 5),
500
+ originalUrl: customUrl.url,
463
501
  };
464
502
  }
465
503
  } else {
@@ -478,7 +516,7 @@ async function processCustomUrls(customUrls, apiKey) {
478
516
 
479
517
  /**
480
518
  * Generate combined llms.txt from all sources
481
- * @param {Array<{sourceName: string, files: Record<string, any>, origin?: string}>} allSources - All processed sources
519
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
482
520
  * @returns {string} Combined llms.txt content
483
521
  */
484
522
  function generateCombinedLlmsTxt(allSources) {
@@ -486,28 +524,22 @@ function generateCombinedLlmsTxt(allSources) {
486
524
  "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";
487
525
 
488
526
  for (const source of allSources) {
489
- const sourceName = source.origin
490
- ? new URL(
491
- source.origin.startsWith("http")
492
- ? source.origin
493
- : `https://${source.origin}`
494
- ).hostname
495
- : source.sourceName;
496
-
497
- combinedTxt += `## ${sourceName}\n\n`;
527
+ combinedTxt += `## ${source.title}\n\n`;
498
528
 
499
529
  // Sort files by path for consistent ordering
500
- const sortedFiles = Object.entries(source.files)
501
- .filter(([path]) => path !== "/llms.txt")
502
- .sort(([a], [b]) => a.localeCompare(b));
530
+ const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
531
+ a.localeCompare(b)
532
+ );
503
533
 
504
534
  for (const [path, file] of sortedFiles) {
505
535
  if (file.content || file.title) {
506
536
  const title = file.title || path.replace(".md", "");
507
537
  const description = file.description ? `: ${file.description}` : "";
508
- combinedTxt += `- [${title}](${path.replace(".md", "")}) (${
509
- file.tokens
510
- } tokens)${description}\n`;
538
+
539
+ // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
540
+ const link = source.keepOriginalUrls ? file.originalUrl : path;
541
+
542
+ combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
511
543
  }
512
544
  }
513
545
 
@@ -560,19 +592,21 @@ async function main() {
560
592
 
561
593
  // Process each source
562
594
  for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
563
- const sourceName = sourceConfig.origin
564
- ? `source ${sourceIndex + 1} (${sourceConfig.origin})`
565
- : `source ${sourceIndex + 1} (custom URLs)`;
595
+ const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;
566
596
 
567
597
  console.log(
568
- `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract})`
598
+ `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
569
599
  );
570
600
 
571
- // Ensure source output directory exists
572
- fs.mkdirSync(sourceConfig.outDir, { recursive: true });
601
+ // Only ensure source output directory exists if not keeping original URLs
602
+ if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
603
+ fs.mkdirSync(sourceConfig.outDir, { recursive: true });
604
+ }
573
605
 
574
- // Load previous manifest for this source
575
- const previousManifest = loadManifest(sourceConfig.outDir);
606
+ // Load previous manifest for this source (only if we have an outDir)
607
+ const previousManifest = sourceConfig.outDir
608
+ ? loadManifest(sourceConfig.outDir)
609
+ : { files: [], timestamp: new Date().toISOString() };
576
610
  const currentFiles = [];
577
611
  let sourceFiles = {};
578
612
 
@@ -617,53 +651,53 @@ async function main() {
617
651
  }
618
652
  }
619
653
 
620
- // Write files to source directory
621
- for (const [filePath, file] of Object.entries(sourceFiles)) {
622
- let filename = filePath;
654
+ // Write files to source directory (only if not keeping original URLs)
655
+ if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
656
+ for (const [filePath, file] of Object.entries(sourceFiles)) {
657
+ let filename = filePath.startsWith("/")
658
+ ? filePath.slice(1)
659
+ : filePath;
623
660
 
624
- if (!config.keepOriginalUrls && sourceConfig.origin) {
625
- // Use relative path within source directory
626
- filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
627
- } else if (!sourceConfig.origin) {
628
- // For custom URL sources, use simple filename
629
- filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
630
- }
661
+ const fullFilePath = path.join(sourceConfig.outDir, filename);
662
+ const fileDir = path.dirname(fullFilePath);
631
663
 
632
- const fullFilePath = path.join(sourceConfig.outDir, filename);
633
- const fileDir = path.dirname(fullFilePath);
664
+ fs.mkdirSync(fileDir, { recursive: true });
665
+ fs.writeFileSync(fullFilePath, file.content);
666
+ currentFiles.push(filename);
634
667
 
635
- fs.mkdirSync(fileDir, { recursive: true });
636
- fs.writeFileSync(fullFilePath, file.content);
637
- currentFiles.push(filename);
668
+ console.log(
669
+ `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
670
+ file.tokens
671
+ } tokens)`
672
+ );
673
+ }
638
674
 
639
- console.log(
640
- `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
641
- file.tokens
642
- } tokens)`
643
- );
644
- }
675
+ // Clean up old files for this source
676
+ if (previousManifest.files.length > 0) {
677
+ cleanupOldFiles(
678
+ sourceConfig.outDir,
679
+ currentFiles,
680
+ previousManifest.files
681
+ );
682
+ }
645
683
 
646
- // Clean up old files for this source
647
- if (previousManifest.files.length > 0) {
648
- cleanupOldFiles(
649
- sourceConfig.outDir,
650
- currentFiles,
651
- previousManifest.files
684
+ // Save manifest for this source
685
+ const newManifest = {
686
+ files: currentFiles,
687
+ timestamp: new Date().toISOString(),
688
+ };
689
+ saveManifest(sourceConfig.outDir, newManifest);
690
+ } else {
691
+ console.log(
692
+ `📋 Keeping original URLs - not saving files locally for ${sourceName}`
652
693
  );
653
694
  }
654
695
 
655
- // Save manifest for this source
656
- const newManifest = {
657
- files: currentFiles,
658
- timestamp: new Date().toISOString(),
659
- };
660
- saveManifest(sourceConfig.outDir, newManifest);
661
-
662
696
  // Add to all sources for combined llms.txt
663
697
  allSources.push({
664
- sourceName: `Source ${sourceIndex + 1}`,
665
- origin: sourceConfig.origin,
698
+ title: sourceConfig.title,
666
699
  files: sourceFiles,
700
+ keepOriginalUrls: sourceConfig.keepOriginalUrls,
667
701
  });
668
702
  } catch (error) {
669
703
  console.error(`❌ Error processing ${sourceName}:`, error.message);
package/mod.js CHANGED
@@ -8,6 +8,7 @@
8
8
  * @property {boolean} extracted - Whether the content was extracted or directly fetched
9
9
  * @property {number} status - HTTP status code or processing status
10
10
  * @property {number} tokens - Number of tokens in the content
11
+ * @property {string} originalUrl - The original URL of the content
11
12
  */
12
13
 
13
14
  /**
@@ -62,6 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
62
63
  status: result.status,
63
64
  tokens: Math.round(result.content.length / 5),
64
65
  publishedDate: result.publishedDate || "",
66
+ originalUrl: urlStr,
65
67
  error: result.error,
66
68
  };
67
69
 
@@ -80,6 +82,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
80
82
  status: 0,
81
83
  tokens: 0,
82
84
  publishedDate: "",
85
+ originalUrl: urlStr,
83
86
  };
84
87
  if (!forceExtract) {
85
88
  urlsNeedingExtract.push(urlStr);
@@ -108,6 +111,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
108
111
  status: 0,
109
112
  tokens: 0,
110
113
  publishedDate: "",
114
+ originalUrl: result.url,
111
115
  };
112
116
 
113
117
  const content = result.full_content || existing.content;
@@ -122,6 +126,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
122
126
  publishedDate: result.published_date || existing.publishedDate,
123
127
  status: existing.status,
124
128
  tokens: Math.round(content.length / 5),
129
+ originalUrl: existing.originalUrl,
125
130
  };
126
131
  }
127
132
 
@@ -137,18 +142,6 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
137
142
  }
138
143
  }
139
144
 
140
- // Generate llms.txt
141
- const llmsTxt = generateLlmsTxt(origin, files);
142
- files["/llms.txt"] = {
143
- content: llmsTxt,
144
- title: "LLMs.txt",
145
- description: "LLM-friendly content listing",
146
- extracted: false,
147
- publishedDate: "",
148
- status: 200,
149
- tokens: Math.round(llmsTxt.length / 5),
150
- };
151
-
152
145
  // Sort files by path
153
146
  const sortedFiles = Object.keys(files)
154
147
  .sort()
@@ -162,7 +155,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
162
155
  (sum, file) => sum + file.tokens,
163
156
  0
164
157
  );
165
- const totalPages = Object.keys(sortedFiles).length - 1; // Exclude llms.txt from page count
158
+ const totalPages = Object.keys(sortedFiles).length;
166
159
  const errors = Object.values(sortedFiles).filter((file) => file.error).length;
167
160
  const processingTimeMs = Date.now() - startTime;
168
161
 
@@ -527,44 +520,6 @@ function getPathFromUrl(urlStr) {
527
520
  }
528
521
  }
529
522
 
530
- /**
531
- * Generate llms.txt content
532
- * @param {string} origin - Site origin
533
- * @param {Record<string, any>} files - Files object
534
- * @returns {string} Generated llms.txt content
535
- */
536
- function generateLlmsTxt(origin, files) {
537
- // Find homepage for top-level description
538
- const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
539
- const siteTitle =
540
- homepageFile?.title ||
541
- new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
542
- const siteDescription =
543
- homepageFile?.description || `Documentation for ${siteTitle}`;
544
-
545
- let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
546
-
547
- // Add documentation section
548
- llmsTxt += "## Documentation\n\n";
549
-
550
- // Sort files by path for consistent ordering
551
- const sortedFiles = Object.entries(files)
552
- .filter(([path]) => path !== "/llms.txt")
553
- .sort(([a], [b]) => a.localeCompare(b));
554
-
555
- for (const [path, file] of sortedFiles) {
556
- if (file.content || file.title) {
557
- const title = file.title || path.replace(".md", "");
558
- const description = file.description ? `: ${file.description}` : "";
559
- llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
560
- file.tokens
561
- } tokens)${description}\n`;
562
- }
563
- }
564
-
565
- return llmsTxt;
566
- }
567
-
568
523
  /**
569
524
  * Call Parallel Extract API for multiple URLs
570
525
  * @param {string[]} urls - URLs to extract
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.6",
4
+ "version": "0.0.8",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [