extract-from-sitemap 0.0.13 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +19 -85
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -21,16 +21,12 @@ const { extractFromSitemap } = require("./mod.js");
|
|
|
21
21
|
*/
|
|
22
22
|
/**
|
|
23
23
|
* @typedef {Object} Config
|
|
24
|
+
* @property {string} description - Description of the documentation collection
|
|
25
|
+
* @property {string} [details] - Optional additional details about the collection
|
|
24
26
|
* @property {string} outDir - Top-level output directory for combined llms.txt
|
|
25
27
|
* @property {SourceConfig[]} sources - Array of source configurations
|
|
26
28
|
*/
|
|
27
29
|
|
|
28
|
-
/**
|
|
29
|
-
* @typedef {Object} Manifest
|
|
30
|
-
* @property {string[]} files - List of generated files
|
|
31
|
-
* @property {string} timestamp - Timestamp of last generation
|
|
32
|
-
*/
|
|
33
|
-
|
|
34
30
|
const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
|
|
35
31
|
const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
|
|
36
32
|
|
|
@@ -218,6 +214,9 @@ async function loadConfig() {
|
|
|
218
214
|
JSON.stringify(
|
|
219
215
|
{
|
|
220
216
|
$schema: "https://extract.llmtext.com/llmtext.schema.json",
|
|
217
|
+
description: "Combined documentation from multiple sources",
|
|
218
|
+
details:
|
|
219
|
+
"This collection includes API documentation, guides, and references.",
|
|
221
220
|
outDir: "./docs",
|
|
222
221
|
sources: [
|
|
223
222
|
{
|
|
@@ -274,6 +273,7 @@ async function loadConfig() {
|
|
|
274
273
|
const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
|
|
275
274
|
|
|
276
275
|
// Validate required fields
|
|
276
|
+
if (!config.description) throw new Error("description is required");
|
|
277
277
|
if (!config.outDir) throw new Error("outDir is required");
|
|
278
278
|
if (!Array.isArray(config.sources))
|
|
279
279
|
throw new Error("sources must be an array");
|
|
@@ -416,59 +416,6 @@ async function getApiKey() {
|
|
|
416
416
|
return newApiKey;
|
|
417
417
|
}
|
|
418
418
|
|
|
419
|
-
/**
|
|
420
|
-
* Load manifest file
|
|
421
|
-
* @param {string} outDir - Output directory
|
|
422
|
-
* @returns {Manifest} The manifest object
|
|
423
|
-
*/
|
|
424
|
-
function loadManifest(outDir) {
|
|
425
|
-
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
426
|
-
|
|
427
|
-
if (!fs.existsSync(manifestPath)) {
|
|
428
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
429
|
-
}
|
|
430
|
-
|
|
431
|
-
try {
|
|
432
|
-
return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
|
|
433
|
-
} catch {
|
|
434
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
435
|
-
}
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
/**
|
|
439
|
-
* Save manifest file
|
|
440
|
-
* @param {string} outDir - Output directory
|
|
441
|
-
* @param {Manifest} manifest - The manifest to save
|
|
442
|
-
*/
|
|
443
|
-
function saveManifest(outDir, manifest) {
|
|
444
|
-
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
445
|
-
fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
|
|
446
|
-
}
|
|
447
|
-
|
|
448
|
-
/**
|
|
449
|
-
* Clean up old files that are no longer generated
|
|
450
|
-
* @param {string} outDir - Output directory
|
|
451
|
-
* @param {string[]} currentFiles - Currently generated files
|
|
452
|
-
* @param {string[]} previousFiles - Previously generated files
|
|
453
|
-
*/
|
|
454
|
-
function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
455
|
-
const filesToRemove = previousFiles.filter(
|
|
456
|
-
(file) => !currentFiles.includes(file)
|
|
457
|
-
);
|
|
458
|
-
|
|
459
|
-
for (const file of filesToRemove) {
|
|
460
|
-
const filePath = path.join(outDir, file);
|
|
461
|
-
try {
|
|
462
|
-
if (fs.existsSync(filePath)) {
|
|
463
|
-
fs.rmSync(filePath);
|
|
464
|
-
console.log(`🗑️ Removed old file: ${file}`);
|
|
465
|
-
}
|
|
466
|
-
} catch (error) {
|
|
467
|
-
console.warn(`⚠️ Could not remove ${file}:`, error.message);
|
|
468
|
-
}
|
|
469
|
-
}
|
|
470
|
-
}
|
|
471
|
-
|
|
472
419
|
/**
|
|
473
420
|
* Process custom URLs through extraction API
|
|
474
421
|
* @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
|
|
@@ -546,12 +493,17 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
|
|
|
546
493
|
|
|
547
494
|
/**
|
|
548
495
|
* Generate combined llms.txt from all sources
|
|
496
|
+
* @param {string} description - Top-level description
|
|
497
|
+
* @param {string} [details] - Optional top-level details
|
|
549
498
|
* @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
|
|
550
499
|
* @returns {string} Combined llms.txt content
|
|
551
500
|
*/
|
|
552
|
-
function generateCombinedLlmsTxt(allSources) {
|
|
553
|
-
let combinedTxt =
|
|
554
|
-
|
|
501
|
+
function generateCombinedLlmsTxt(description, details, allSources) {
|
|
502
|
+
let combinedTxt = `# Documentation Collection\n\n> ${description}\n\n`;
|
|
503
|
+
|
|
504
|
+
if (details) {
|
|
505
|
+
combinedTxt += `${details}\n\n`;
|
|
506
|
+
}
|
|
555
507
|
|
|
556
508
|
for (const source of allSources) {
|
|
557
509
|
combinedTxt += `## ${source.title}\n\n`;
|
|
@@ -641,11 +593,6 @@ async function main() {
|
|
|
641
593
|
fs.mkdirSync(sourceConfig.outDir, { recursive: true });
|
|
642
594
|
}
|
|
643
595
|
|
|
644
|
-
// Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
|
|
645
|
-
const previousManifest = !sourceConfig.keepOriginalUrls
|
|
646
|
-
? loadManifest(sourceConfig.outDir)
|
|
647
|
-
: { files: [], timestamp: new Date().toISOString() };
|
|
648
|
-
const currentFiles = [];
|
|
649
596
|
let sourceFiles = {};
|
|
650
597
|
|
|
651
598
|
try {
|
|
@@ -702,7 +649,6 @@ async function main() {
|
|
|
702
649
|
|
|
703
650
|
fs.mkdirSync(fileDir, { recursive: true });
|
|
704
651
|
fs.writeFileSync(fullFilePath, file.content);
|
|
705
|
-
currentFiles.push(filename);
|
|
706
652
|
|
|
707
653
|
console.log(
|
|
708
654
|
`š Wrote: ${path.join(sourceConfig.outDir, filename)} (${
|
|
@@ -710,22 +656,6 @@ async function main() {
|
|
|
710
656
|
} tokens)`
|
|
711
657
|
);
|
|
712
658
|
}
|
|
713
|
-
|
|
714
|
-
// Clean up old files for this source
|
|
715
|
-
if (previousManifest.files.length > 0) {
|
|
716
|
-
cleanupOldFiles(
|
|
717
|
-
sourceConfig.outDir,
|
|
718
|
-
currentFiles,
|
|
719
|
-
previousManifest.files
|
|
720
|
-
);
|
|
721
|
-
}
|
|
722
|
-
|
|
723
|
-
// Save manifest for this source
|
|
724
|
-
const newManifest = {
|
|
725
|
-
files: currentFiles,
|
|
726
|
-
timestamp: new Date().toISOString(),
|
|
727
|
-
};
|
|
728
|
-
saveManifest(sourceConfig.outDir, newManifest);
|
|
729
659
|
} else {
|
|
730
660
|
console.log(
|
|
731
661
|
`š Keeping original URLs - not saving files locally for ${sourceName}`
|
|
@@ -752,7 +682,11 @@ async function main() {
|
|
|
752
682
|
|
|
753
683
|
// Generate and write combined llms.txt to top-level outDir
|
|
754
684
|
if (allSources.length > 0) {
|
|
755
|
-
const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
|
|
685
|
+
const combinedLlmsTxt = generateCombinedLlmsTxt(
|
|
686
|
+
config.description,
|
|
687
|
+
config.details,
|
|
688
|
+
allSources
|
|
689
|
+
);
|
|
756
690
|
const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
|
|
757
691
|
fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
|
|
758
692
|
console.log(`\nš Generated combined llms.txt: ${combinedLlmsTxtPath}`);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.13",
|
|
4
|
+
"version": "0.0.14",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|