extract-from-sitemap 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +113 -79
- package/mod.js +6 -51
- package/package.json +1 -1
package/cli.js
CHANGED
@@ -11,17 +11,18 @@ const { extractFromSitemap } = require("./mod.js");

 /**
  * @typedef {Object} SourceConfig
+ * @property {string} title - The title for this source
  * @property {string} [origin] - The origin URL to process (optional)
- * @property {
- * @property {
- * @property {
+ * @property {string} [outDir] - Output directory for this source's extracted files
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
  */

 /**
  * @typedef {Object} Config
  * @property {string} outDir - Top-level output directory for combined llms.txt
  * @property {SourceConfig[]} sources - Array of source configurations
- * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
  */

 /**

@@ -220,28 +221,47 @@ async function loadConfig() {
     outDir: "./docs",
     sources: [
       {
+        title: "Parallel AI Documentation",
         origin: "https://docs.parallel.ai",
         forceExtract: false,
         outDir: "./docs/parallel-docs",
+        keepOriginalUrls: false,
       },
       {
+        title: "Parallel AI Website",
         origin: "https://parallel.ai",
         forceExtract: true,
         outDir: "./docs/parallel-main",
+        keepOriginalUrls: false,
       },
       {
+        title: "Custom Resources",
         forceExtract: true,
         outDir: "./docs/custom",
+        keepOriginalUrls: false,
         customUrls: [
           {
             title: "Custom Page",
             description: "A custom page to extract",
+            filename: "custom-page",
             url: "https://example.com/page",
           },
         ],
       },
+      {
+        title: "External References",
+        keepOriginalUrls: true,
+        forceExtract: false,
+        customUrls: [
+          {
+            title: "External API Guide",
+            description: "Third-party API documentation",
+            filename: "external-api",
+            url: "https://external.com/api-guide",
+          },
+        ],
+      },
     ],
-    keepOriginalUrls: false,
   },
   null,
   2

@@ -263,12 +283,15 @@ async function loadConfig() {
     if (typeof sourceConfig !== "object" || sourceConfig === null) {
       throw new Error(`sources[${index}] must be an object`);
     }
-    if (!sourceConfig.
-      throw new Error(`sources[${index}].
-    }
-    if (typeof sourceConfig.forceExtract !== "boolean") {
-      throw new Error(`sources[${index}].forceExtract must be a boolean`);
+    if (!sourceConfig.title) {
+      throw new Error(`sources[${index}].title is required`);
     }
+
+    // Set defaults
+    sourceConfig.forceExtract = sourceConfig.forceExtract ?? false;
+    sourceConfig.keepOriginalUrls = sourceConfig.keepOriginalUrls ?? false;
+    sourceConfig.customUrls = sourceConfig.customUrls || [];
+
     // Either origin or customUrls must be provided
     if (
       !sourceConfig.origin &&

@@ -278,14 +301,29 @@ async function loadConfig() {
         `sources[${index}] must have either origin or customUrls`
       );
     }
-    }

-
-
+    // outDir is required unless keepOriginalUrls is true
+    if (!sourceConfig.outDir && !sourceConfig.keepOriginalUrls) {
+      throw new Error(
+        `sources[${index}].outDir is required when keepOriginalUrls is false`
+      );
+    }

-
-
-
+    // Validate customUrls
+    for (const [urlIndex, customUrl] of (
+      sourceConfig.customUrls || []
+    ).entries()) {
+      if (
+        !customUrl.title ||
+        !customUrl.description ||
+        !customUrl.filename ||
+        !customUrl.url
+      ) {
+        throw new Error(
+          `sources[${index}].customUrls[${urlIndex}] must have title, description, filename, and url`
+        );
+      }
+    }
   }

   return config;

@@ -421,7 +459,7 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {

 /**
  * Process custom URLs through extraction API
- * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
  * @param {string} apiKey - API key for authentication
  * @returns {Promise<Record<string, any>>} Extracted files
  */

@@ -449,8 +487,7 @@ async function processCustomUrls(customUrls, apiKey) {
       const result = await response.json();
       if (result.results && result.results.length > 0) {
         const extracted = result.results[0];
-        const filename =
-          customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
+        const filename = customUrl.filename + ".md";

         files[filename] = {
           content: extracted.full_content || "",

@@ -460,6 +497,7 @@ async function processCustomUrls(customUrls, apiKey) {
           publishedDate: extracted.published_date || "",
           status: 200,
           tokens: Math.round((extracted.full_content || "").length / 5),
+          originalUrl: customUrl.url,
         };
       }
     } else {

@@ -478,7 +516,7 @@ async function processCustomUrls(customUrls, apiKey) {

 /**
  * Generate combined llms.txt from all sources
- * @param {Array<{
+ * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean}>} allSources - All processed sources
  * @returns {string} Combined llms.txt content
  */
 function generateCombinedLlmsTxt(allSources) {

@@ -486,28 +524,22 @@ function generateCombinedLlmsTxt(allSources) {
     "# Documentation Collection\n\n> Combined documentation from multiple sources\n\n";

   for (const source of allSources) {
-
-      ? new URL(
-          source.origin.startsWith("http")
-            ? source.origin
-            : `https://${source.origin}`
-        ).hostname
-      : source.sourceName;
-
-    combinedTxt += `## ${sourceName}\n\n`;
+    combinedTxt += `## ${source.title}\n\n`;

     // Sort files by path for consistent ordering
-    const sortedFiles = Object.entries(source.files)
-      .
-
+    const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
+      a.localeCompare(b)
+    );

     for (const [path, file] of sortedFiles) {
       if (file.content || file.title) {
         const title = file.title || path.replace(".md", "");
         const description = file.description ? `: ${file.description}` : "";
-
-
-
+
+        // If keepOriginalUrls is true, link to the original URL, otherwise link to the local file
+        const link = source.keepOriginalUrls ? file.originalUrl : path;
+
+        combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
       }
     }


@@ -560,19 +592,21 @@ async function main() {

   // Process each source
   for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
-    const sourceName = sourceConfig.
-      ? `source ${sourceIndex + 1} (${sourceConfig.origin})`
-      : `source ${sourceIndex + 1} (custom URLs)`;
+    const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;

     console.log(
-      `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract})`
+      `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
     );

-    //
-
+    // Only ensure source output directory exists if not keeping original URLs
+    if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+      fs.mkdirSync(sourceConfig.outDir, { recursive: true });
+    }

-    // Load previous manifest for this source
-    const previousManifest =
+    // Load previous manifest for this source (only if we have an outDir)
+    const previousManifest = sourceConfig.outDir
+      ? loadManifest(sourceConfig.outDir)
+      : { files: [], timestamp: new Date().toISOString() };
     const currentFiles = [];
     let sourceFiles = {};


@@ -617,53 +651,53 @@ async function main() {
        }
      }

-      // Write files to source directory
-
-
+      // Write files to source directory (only if not keeping original URLs)
+      if (!sourceConfig.keepOriginalUrls && sourceConfig.outDir) {
+        for (const [filePath, file] of Object.entries(sourceFiles)) {
+          let filename = filePath.startsWith("/")
+            ? filePath.slice(1)
+            : filePath;

-
-
-        filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
-      } else if (!sourceConfig.origin) {
-        // For custom URL sources, use simple filename
-        filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
-      }
+          const fullFilePath = path.join(sourceConfig.outDir, filename);
+          const fileDir = path.dirname(fullFilePath);

-
-
+          fs.mkdirSync(fileDir, { recursive: true });
+          fs.writeFileSync(fullFilePath, file.content);
+          currentFiles.push(filename);

-
-
-
+          console.log(
+            `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
+              file.tokens
+            } tokens)`
+          );
+        }

-
-
-
-
-
-
+        // Clean up old files for this source
+        if (previousManifest.files.length > 0) {
+          cleanupOldFiles(
+            sourceConfig.outDir,
+            currentFiles,
+            previousManifest.files
+          );
+        }

-
-
-
-
-
-
+        // Save manifest for this source
+        const newManifest = {
+          files: currentFiles,
+          timestamp: new Date().toISOString(),
+        };
+        saveManifest(sourceConfig.outDir, newManifest);
+      } else {
+        console.log(
+          `📋 Keeping original URLs - not saving files locally for ${sourceName}`
        );
      }

-      // Save manifest for this source
-      const newManifest = {
-        files: currentFiles,
-        timestamp: new Date().toISOString(),
-      };
-      saveManifest(sourceConfig.outDir, newManifest);
-
      // Add to all sources for combined llms.txt
      allSources.push({
-
-        origin: sourceConfig.origin,
+        title: sourceConfig.title,
        files: sourceFiles,
+        keepOriginalUrls: sourceConfig.keepOriginalUrls,
      });
    } catch (error) {
      console.error(`❌ Error processing ${sourceName}:`, error.message);
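Taken together, the cli.js changes make each source self-describing: `title` is now required, `forceExtract` and `keepOriginalUrls` default to `false`, `outDir` is only required when files are written locally, and every `customUrls` entry must carry an explicit `filename`. A minimal sketch of two source entries under the new validation rules, mirroring the defaults shown in the diff above (values are illustrative, not prescriptive):

```js
// Sketch only: property names follow the new SourceConfig typedef;
// titles, URLs, and paths echo the package's own default config.
const sources = [
  {
    title: "Parallel AI Documentation", // required as of 0.0.8
    origin: "https://docs.parallel.ai", // either origin or customUrls must be set
    outDir: "./docs/parallel-docs",     // required because keepOriginalUrls is false
    forceExtract: false,                // optional; defaults to false
    keepOriginalUrls: false,            // optional; defaults to false
  },
  {
    title: "External References",
    keepOriginalUrls: true, // no outDir needed; nothing is written locally
    customUrls: [
      {
        title: "External API Guide",
        description: "Third-party API documentation",
        filename: "external-api", // the extracted entry is keyed as external-api.md
        url: "https://external.com/api-guide",
      },
    ],
  },
];
```

With `keepOriginalUrls: true`, generateCombinedLlmsTxt links the entry to `file.originalUrl` instead of a local path, and no files or manifest are saved for that source.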
package/mod.js
CHANGED
@@ -8,6 +8,7 @@
  * @property {boolean} extracted - Whether the content was extracted or directly fetched
  * @property {number} status - HTTP status code or processing status
  * @property {number} tokens - Number of tokens in the content
+ * @property {string} originalUrl - The original URL of the content
  */

 /**

@@ -62,6 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        status: result.status,
        tokens: Math.round(result.content.length / 5),
        publishedDate: result.publishedDate || "",
+        originalUrl: urlStr,
        error: result.error,
      };


@@ -80,6 +82,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        status: 0,
        tokens: 0,
        publishedDate: "",
+        originalUrl: urlStr,
      };
      if (!forceExtract) {
        urlsNeedingExtract.push(urlStr);

@@ -108,6 +111,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        status: 0,
        tokens: 0,
        publishedDate: "",
+        originalUrl: result.url,
      };

      const content = result.full_content || existing.content;

@@ -122,6 +126,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
        publishedDate: result.published_date || existing.publishedDate,
        status: existing.status,
        tokens: Math.round(content.length / 5),
+        originalUrl: existing.originalUrl,
      };
    }


@@ -137,18 +142,6 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
    }
  }

-  // Generate llms.txt
-  const llmsTxt = generateLlmsTxt(origin, files);
-  files["/llms.txt"] = {
-    content: llmsTxt,
-    title: "LLMs.txt",
-    description: "LLM-friendly content listing",
-    extracted: false,
-    publishedDate: "",
-    status: 200,
-    tokens: Math.round(llmsTxt.length / 5),
-  };
-
  // Sort files by path
  const sortedFiles = Object.keys(files)
    .sort()

@@ -162,7 +155,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
    (sum, file) => sum + file.tokens,
    0
  );
-  const totalPages = Object.keys(sortedFiles).length
+  const totalPages = Object.keys(sortedFiles).length;
  const errors = Object.values(sortedFiles).filter((file) => file.error).length;
  const processingTimeMs = Date.now() - startTime;


@@ -527,44 +520,6 @@ function getPathFromUrl(urlStr) {
  }
 }

-/**
- * Generate llms.txt content
- * @param {string} origin - Site origin
- * @param {Record<string, any>} files - Files object
- * @returns {string} Generated llms.txt content
- */
-function generateLlmsTxt(origin, files) {
-  // Find homepage for top-level description
-  const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
-  const siteTitle =
-    homepageFile?.title ||
-    new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
-  const siteDescription =
-    homepageFile?.description || `Documentation for ${siteTitle}`;
-
-  let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
-
-  // Add documentation section
-  llmsTxt += "## Documentation\n\n";
-
-  // Sort files by path for consistent ordering
-  const sortedFiles = Object.entries(files)
-    .filter(([path]) => path !== "/llms.txt")
-    .sort(([a], [b]) => a.localeCompare(b));
-
-  for (const [path, file] of sortedFiles) {
-    if (file.content || file.title) {
-      const title = file.title || path.replace(".md", "");
-      const description = file.description ? `: ${file.description}` : "";
-      llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
-        file.tokens
-      } tokens)${description}\n`;
-    }
-  }
-
-  return llmsTxt;
-}
-
 /**
  * Call Parallel Extract API for multiple URLs
  * @param {string[]} urls - URLs to extract
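On the mod.js side, every file entry (fresh extraction, cached hit, or failure placeholder) now carries `originalUrl`, and the per-origin `/llms.txt` entry is no longer generated here; combined llms.txt generation lives in the CLI. A minimal consumer sketch, assuming the argument order from the hunk headers and that the resolved value exposes the per-path file map described by the typedef (the exact return shape is not visible in this diff, so the destructured `files` below is an assumption):

```js
import { extractFromSitemap } from "./mod.js";

// Sketch only: extractFromSitemap(origin, forceExtract = false, apiKey),
// as shown in the hunk headers above. `files` is an assumed return field.
const apiKey = "<your Parallel API key>";
const { files } = await extractFromSitemap("https://docs.parallel.ai", false, apiKey);

for (const [path, file] of Object.entries(files)) {
  // As of 0.0.8 each entry includes originalUrl, and no synthetic "/llms.txt"
  // entry is injected into the map by the module itself.
  console.log(`${path} <- ${file.originalUrl} (${file.tokens} tokens)`);
}
```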
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.6",
+  "version": "0.0.8",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [