extract-from-sitemap 0.0.22 → 0.0.24

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (3)
  1. package/cli.js +23 -3
  2. package/mod.js +58 -41
  3. package/package.json +2 -2
package/cli.js CHANGED
@@ -222,6 +222,7 @@ async function loadConfig() {
222
222
  outDir: "./docs",
223
223
  sources: [
224
224
  {
225
+ type: "llms.txt",
225
226
  title: "Parallel AI Documentation",
226
227
  origin: "https://docs.parallel.ai",
227
228
  forceExtract: false,
@@ -229,6 +230,7 @@ async function loadConfig() {
229
230
  keepOriginalUrls: false,
230
231
  },
231
232
  {
233
+ type: "sitemap",
232
234
  title: "Parallel AI Website",
233
235
  origin: "https://parallel.ai",
234
236
  forceExtract: true,
@@ -236,6 +238,7 @@ async function loadConfig() {
236
238
  keepOriginalUrls: false,
237
239
  },
238
240
  {
241
+ type: "custom",
239
242
  title: "Custom Resources",
240
243
  forceExtract: true,
241
244
  outDir: "./docs/custom",
@@ -250,6 +253,7 @@ async function loadConfig() {
250
253
  ],
251
254
  },
252
255
  {
256
+ type: "custom",
253
257
  title: "External References",
254
258
  keepOriginalUrls: true,
255
259
  forceExtract: false,
@@ -291,6 +295,14 @@ async function loadConfig() {
291
295
  if (!sourceConfig.title) {
292
296
  throw new Error(`sources[${index}].title is required`);
293
297
  }
298
+ if (!sourceConfig.type) {
299
+ throw new Error(`sources[${index}].type is required`);
300
+ }
301
+ if (!["llms.txt", "sitemap", "custom"].includes(sourceConfig.type)) {
302
+ throw new Error(
303
+ `sources[${index}].type must be one of: llms.txt, sitemap, custom`
304
+ );
305
+ }
294
306
 
295
307
  // Set defaults
296
308
  sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
@@ -313,13 +325,21 @@ async function loadConfig() {
313
325
  }
314
326
  }
315
327
 
316
- // Either origin or customUrls must be provided
328
+ // Validate type-specific requirements
329
+ if (
330
+ (sourceConfig.type === "llms.txt" || sourceConfig.type === "sitemap") &&
331
+ !sourceConfig.origin
332
+ ) {
333
+ throw new Error(
334
+ `sources[${index}] with type '${sourceConfig.type}' requires origin`
335
+ );
336
+ }
317
337
  if (
318
- !sourceConfig.origin &&
338
+ sourceConfig.type === "custom" &&
319
339
  (!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
320
340
  ) {
321
341
  throw new Error(
322
- `sources[${index}] must have either origin or customUrls`
342
+ `sources[${index}] with type 'custom' requires customUrls with at least one entry`
323
343
  );
324
344
  }
325
345
 
package/mod.js CHANGED
@@ -26,8 +26,9 @@ import { parseLlmsTxt } from "parse-llms-txt";
26
26
 
27
27
  /**
28
28
  * @typedef {Object} SourceConfig
29
+ * @property {"llms.txt" | "sitemap" | "custom"} type - The source type
29
30
  * @property {string} title - The title for this source
30
- * @property {string} [origin] - The origin URL to process (optional)
31
+ * @property {string} [origin] - The origin URL to process (required for llms.txt and sitemap types)
31
32
  * @property {string} [outDir] - Output directory for this source's extracted files
32
33
  * @property {boolean} [forceExtract] - Whether to force extraction for this source
33
34
  * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
@@ -278,8 +279,34 @@ async function extractFromLlmsTxtEntries(
278
279
  }
279
280
 
280
281
  /**
281
- * Extract content from sitemap URLs with markdown variant detection
282
- * Tries llms.txt first if available, then falls back to sitemap
282
+ * Extract content from llms.txt only (no fallback to sitemap)
283
+ * @param {string} origin - The origin URL to extract from
284
+ * @param {string} apiKey - Parallel API key
285
+ * @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
286
+ * @returns {Promise<ResponseData>}
287
+ */
288
+ export async function extractFromLlmsTxt(origin, apiKey, titleRemovePattern) {
289
+ const llmsTxtContent = await fetchLlmsTxt(origin);
290
+ if (!llmsTxtContent) {
291
+ throw new Error(`Could not find llms.txt for ${origin}`);
292
+ }
293
+
294
+ const llmsTxt = parseLlmsTxt(llmsTxtContent);
295
+ const totalEntries = llmsTxt.sections.reduce(
296
+ (sum, section) => sum + section.files.length,
297
+ 0,
298
+ );
299
+
300
+ if (totalEntries === 0) {
301
+ throw new Error(`llms.txt found but contains no entries for ${origin}`);
302
+ }
303
+
304
+ console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
305
+ return extractFromLlmsTxtEntries(llmsTxt, origin, apiKey, titleRemovePattern);
306
+ }
307
+
308
+ /**
309
+ * Extract content from sitemap URLs with markdown variant detection (no llms.txt fallback)
283
310
  * @param {string} origin - The origin URL to extract from
284
311
  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
285
312
  * @param {string} apiKey - Parallel API key
@@ -296,29 +323,9 @@ export async function extractFromSitemap(
296
323
  let fetchCount = 0;
297
324
  let extractApiCallCount = 0;
298
325
 
299
- // Try llms.txt first
300
- const llmsTxtContent = await fetchLlmsTxt(origin);
301
- if (llmsTxtContent) {
302
- const llmsTxt = parseLlmsTxt(llmsTxtContent);
303
- const totalEntries = llmsTxt.sections.reduce(
304
- (sum, section) => sum + section.files.length,
305
- 0,
306
- );
307
- if (totalEntries > 0) {
308
- console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
309
- return extractFromLlmsTxtEntries(
310
- llmsTxt,
311
- origin,
312
- apiKey,
313
- titleRemovePattern,
314
- );
315
- }
316
- }
317
-
318
- // Fall back to sitemap discovery
319
326
  const sitemapUrl = await discoverSitemap(origin);
320
327
  if (!sitemapUrl) {
321
- throw new Error(`Could not find sitemap or llms.txt for ${origin}`);
328
+ throw new Error(`Could not find sitemap for ${origin}`);
322
329
  }
323
330
 
324
331
  // Parse sitemap and get URLs
@@ -531,11 +538,10 @@ export async function processLLMTextConfig(config, apiKey) {
531
538
  let sourceFiles = {};
532
539
 
533
540
  try {
534
- // Process origin if provided
535
- if (sourceConfig.origin) {
536
- const result = await extractFromSitemap(
541
+ // Process based on type
542
+ if (sourceConfig.type === "llms.txt" && sourceConfig.origin) {
543
+ const result = await extractFromLlmsTxt(
537
544
  sourceConfig.origin,
538
- sourceConfig.forceExtract || false,
539
545
  apiKey,
540
546
  sourceConfig.titleRemovePattern,
541
547
  );
@@ -544,22 +550,33 @@ export async function processLLMTextConfig(config, apiKey) {
544
550
  totalTokens += result.totalTokens;
545
551
  totalPages += result.totalPages;
546
552
  totalErrors += result.errors;
547
- }
548
-
549
- // Process custom URLs for this source
550
- if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
551
- const customFiles = await processCustomUrls(
552
- sourceConfig.customUrls,
553
+ } else if (sourceConfig.type === "sitemap" && sourceConfig.origin) {
554
+ const result = await extractFromSitemap(
555
+ sourceConfig.origin,
556
+ sourceConfig.forceExtract || false,
553
557
  apiKey,
558
+ sourceConfig.titleRemovePattern,
554
559
  );
555
560
 
556
- // Merge custom files with sitemap files
557
- sourceFiles = { ...sourceFiles, ...customFiles };
558
-
559
- for (const file of Object.values(customFiles)) {
560
- totalTokens += file.tokens;
561
- totalPages++;
562
- if (file.error) totalErrors++;
561
+ sourceFiles = result.files;
562
+ totalTokens += result.totalTokens;
563
+ totalPages += result.totalPages;
564
+ totalErrors += result.errors;
565
+ } else if (sourceConfig.type === "custom") {
566
+ // Process custom URLs for this source
567
+ if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
568
+ const customFiles = await processCustomUrls(
569
+ sourceConfig.customUrls,
570
+ apiKey,
571
+ );
572
+
573
+ sourceFiles = customFiles;
574
+
575
+ for (const file of Object.values(customFiles)) {
576
+ totalTokens += file.tokens;
577
+ totalPages++;
578
+ if (file.error) totalErrors++;
579
+ }
563
580
  }
564
581
  }
565
582
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.22",
4
+ "version": "0.0.24",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [
@@ -13,6 +13,6 @@
13
13
  "@cloudflare/workers-types": "4.20251011.0"
14
14
  },
15
15
  "dependencies": {
16
- "parse-llms-txt": "^0.0.9"
16
+ "parse-llms-txt": "^0.0.10"
17
17
  }
18
18
  }