extract-from-sitemap 0.0.23 → 0.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +23 -3
- package/mod.js +58 -41
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -222,6 +222,7 @@ async function loadConfig() {
|
|
|
222
222
|
outDir: "./docs",
|
|
223
223
|
sources: [
|
|
224
224
|
{
|
|
225
|
+
type: "llms.txt",
|
|
225
226
|
title: "Parallel AI Documentation",
|
|
226
227
|
origin: "https://docs.parallel.ai",
|
|
227
228
|
forceExtract: false,
|
|
@@ -229,6 +230,7 @@ async function loadConfig() {
|
|
|
229
230
|
keepOriginalUrls: false,
|
|
230
231
|
},
|
|
231
232
|
{
|
|
233
|
+
type: "sitemap",
|
|
232
234
|
title: "Parallel AI Website",
|
|
233
235
|
origin: "https://parallel.ai",
|
|
234
236
|
forceExtract: true,
|
|
@@ -236,6 +238,7 @@ async function loadConfig() {
|
|
|
236
238
|
keepOriginalUrls: false,
|
|
237
239
|
},
|
|
238
240
|
{
|
|
241
|
+
type: "custom",
|
|
239
242
|
title: "Custom Resources",
|
|
240
243
|
forceExtract: true,
|
|
241
244
|
outDir: "./docs/custom",
|
|
@@ -250,6 +253,7 @@ async function loadConfig() {
|
|
|
250
253
|
],
|
|
251
254
|
},
|
|
252
255
|
{
|
|
256
|
+
type: "custom",
|
|
253
257
|
title: "External References",
|
|
254
258
|
keepOriginalUrls: true,
|
|
255
259
|
forceExtract: false,
|
|
@@ -291,6 +295,14 @@ async function loadConfig() {
|
|
|
291
295
|
if (!sourceConfig.title) {
|
|
292
296
|
throw new Error(`sources[${index}].title is required`);
|
|
293
297
|
}
|
|
298
|
+
if (!sourceConfig.type) {
|
|
299
|
+
throw new Error(`sources[${index}].type is required`);
|
|
300
|
+
}
|
|
301
|
+
if (!["llms.txt", "sitemap", "custom"].includes(sourceConfig.type)) {
|
|
302
|
+
throw new Error(
|
|
303
|
+
`sources[${index}].type must be one of: llms.txt, sitemap, custom`
|
|
304
|
+
);
|
|
305
|
+
}
|
|
294
306
|
|
|
295
307
|
// Set defaults
|
|
296
308
|
sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
|
|
@@ -313,13 +325,21 @@ async function loadConfig() {
|
|
|
313
325
|
}
|
|
314
326
|
}
|
|
315
327
|
|
|
316
|
-
//
|
|
328
|
+
// Validate type-specific requirements
|
|
329
|
+
if (
|
|
330
|
+
(sourceConfig.type === "llms.txt" || sourceConfig.type === "sitemap") &&
|
|
331
|
+
!sourceConfig.origin
|
|
332
|
+
) {
|
|
333
|
+
throw new Error(
|
|
334
|
+
`sources[${index}] with type '${sourceConfig.type}' requires origin`
|
|
335
|
+
);
|
|
336
|
+
}
|
|
317
337
|
if (
|
|
318
|
-
|
|
338
|
+
sourceConfig.type === "custom" &&
|
|
319
339
|
(!sourceConfig.customUrls || sourceConfig.customUrls.length === 0)
|
|
320
340
|
) {
|
|
321
341
|
throw new Error(
|
|
322
|
-
`sources[${index}]
|
|
342
|
+
`sources[${index}] with type 'custom' requires customUrls with at least one entry`
|
|
323
343
|
);
|
|
324
344
|
}
|
|
325
345
|
|
package/mod.js
CHANGED
|
@@ -26,8 +26,9 @@ import { parseLlmsTxt } from "parse-llms-txt";
|
|
|
26
26
|
|
|
27
27
|
/**
|
|
28
28
|
* @typedef {Object} SourceConfig
|
|
29
|
+
* @property {"llms.txt" | "sitemap" | "custom"} type - The source type
|
|
29
30
|
* @property {string} title - The title for this source
|
|
30
|
-
* @property {string} [origin] - The origin URL to process (
|
|
31
|
+
* @property {string} [origin] - The origin URL to process (required for llms.txt and sitemap types)
|
|
31
32
|
* @property {string} [outDir] - Output directory for this source's extracted files
|
|
32
33
|
* @property {boolean} [forceExtract] - Whether to force extraction for this source
|
|
33
34
|
* @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
|
|
@@ -278,8 +279,34 @@ async function extractFromLlmsTxtEntries(
|
|
|
278
279
|
}
|
|
279
280
|
|
|
280
281
|
/**
|
|
281
|
-
* Extract content from
|
|
282
|
-
*
|
|
282
|
+
* Extract content from llms.txt only (no fallback to sitemap)
|
|
283
|
+
* @param {string} origin - The origin URL to extract from
|
|
284
|
+
* @param {string} apiKey - Parallel API key
|
|
285
|
+
* @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
|
|
286
|
+
* @returns {Promise<ResponseData>}
|
|
287
|
+
*/
|
|
288
|
+
export async function extractFromLlmsTxt(origin, apiKey, titleRemovePattern) {
|
|
289
|
+
const llmsTxtContent = await fetchLlmsTxt(origin);
|
|
290
|
+
if (!llmsTxtContent) {
|
|
291
|
+
throw new Error(`Could not find llms.txt for ${origin}`);
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
const llmsTxt = parseLlmsTxt(llmsTxtContent);
|
|
295
|
+
const totalEntries = llmsTxt.sections.reduce(
|
|
296
|
+
(sum, section) => sum + section.files.length,
|
|
297
|
+
0,
|
|
298
|
+
);
|
|
299
|
+
|
|
300
|
+
if (totalEntries === 0) {
|
|
301
|
+
throw new Error(`llms.txt found but contains no entries for ${origin}`);
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
|
|
305
|
+
return extractFromLlmsTxtEntries(llmsTxt, origin, apiKey, titleRemovePattern);
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
/**
|
|
309
|
+
* Extract content from sitemap URLs with markdown variant detection (no llms.txt fallback)
|
|
283
310
|
* @param {string} origin - The origin URL to extract from
|
|
284
311
|
* @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
|
|
285
312
|
* @param {string} apiKey - Parallel API key
|
|
@@ -296,29 +323,9 @@ export async function extractFromSitemap(
|
|
|
296
323
|
let fetchCount = 0;
|
|
297
324
|
let extractApiCallCount = 0;
|
|
298
325
|
|
|
299
|
-
// Try llms.txt first
|
|
300
|
-
const llmsTxtContent = await fetchLlmsTxt(origin);
|
|
301
|
-
if (llmsTxtContent) {
|
|
302
|
-
const llmsTxt = parseLlmsTxt(llmsTxtContent);
|
|
303
|
-
const totalEntries = llmsTxt.sections.reduce(
|
|
304
|
-
(sum, section) => sum + section.files.length,
|
|
305
|
-
0,
|
|
306
|
-
);
|
|
307
|
-
if (totalEntries > 0) {
|
|
308
|
-
console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
|
|
309
|
-
return extractFromLlmsTxtEntries(
|
|
310
|
-
llmsTxt,
|
|
311
|
-
origin,
|
|
312
|
-
apiKey,
|
|
313
|
-
titleRemovePattern,
|
|
314
|
-
);
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
|
-
|
|
318
|
-
// Fall back to sitemap discovery
|
|
319
326
|
const sitemapUrl = await discoverSitemap(origin);
|
|
320
327
|
if (!sitemapUrl) {
|
|
321
|
-
throw new Error(`Could not find sitemap
|
|
328
|
+
throw new Error(`Could not find sitemap for ${origin}`);
|
|
322
329
|
}
|
|
323
330
|
|
|
324
331
|
// Parse sitemap and get URLs
|
|
@@ -531,11 +538,10 @@ export async function processLLMTextConfig(config, apiKey) {
|
|
|
531
538
|
let sourceFiles = {};
|
|
532
539
|
|
|
533
540
|
try {
|
|
534
|
-
// Process
|
|
535
|
-
if (sourceConfig.origin) {
|
|
536
|
-
const result = await extractFromSitemap(
|
|
541
|
+
// Process based on type
|
|
542
|
+
if (sourceConfig.type === "llms.txt" && sourceConfig.origin) {
|
|
543
|
+
const result = await extractFromLlmsTxt(
|
|
537
544
|
sourceConfig.origin,
|
|
538
|
-
sourceConfig.forceExtract || false,
|
|
539
545
|
apiKey,
|
|
540
546
|
sourceConfig.titleRemovePattern,
|
|
541
547
|
);
|
|
@@ -544,22 +550,33 @@ export async function processLLMTextConfig(config, apiKey) {
|
|
|
544
550
|
totalTokens += result.totalTokens;
|
|
545
551
|
totalPages += result.totalPages;
|
|
546
552
|
totalErrors += result.errors;
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
const customFiles = await processCustomUrls(
|
|
552
|
-
sourceConfig.customUrls,
|
|
553
|
+
} else if (sourceConfig.type === "sitemap" && sourceConfig.origin) {
|
|
554
|
+
const result = await extractFromSitemap(
|
|
555
|
+
sourceConfig.origin,
|
|
556
|
+
sourceConfig.forceExtract || false,
|
|
553
557
|
apiKey,
|
|
558
|
+
sourceConfig.titleRemovePattern,
|
|
554
559
|
);
|
|
555
560
|
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
561
|
+
sourceFiles = result.files;
|
|
562
|
+
totalTokens += result.totalTokens;
|
|
563
|
+
totalPages += result.totalPages;
|
|
564
|
+
totalErrors += result.errors;
|
|
565
|
+
} else if (sourceConfig.type === "custom") {
|
|
566
|
+
// Process custom URLs for this source
|
|
567
|
+
if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
|
|
568
|
+
const customFiles = await processCustomUrls(
|
|
569
|
+
sourceConfig.customUrls,
|
|
570
|
+
apiKey,
|
|
571
|
+
);
|
|
572
|
+
|
|
573
|
+
sourceFiles = customFiles;
|
|
574
|
+
|
|
575
|
+
for (const file of Object.values(customFiles)) {
|
|
576
|
+
totalTokens += file.tokens;
|
|
577
|
+
totalPages++;
|
|
578
|
+
if (file.error) totalErrors++;
|
|
579
|
+
}
|
|
563
580
|
}
|
|
564
581
|
}
|
|
565
582
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.23",
|
|
4
|
+
"version": "0.0.24",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|