extract-from-sitemap 0.0.13 → 0.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +23 -85
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -21,16 +21,13 @@ const { extractFromSitemap } = require("./mod.js");
|
|
|
21
21
|
*/
|
|
22
22
|
/**
|
|
23
23
|
* @typedef {Object} Config
|
|
24
|
+
* @property {string} title - Title of your document
|
|
25
|
+
* @property {string} description - Description of the documentation collection
|
|
26
|
+
* @property {string} [details] - Optional additional details about the collection
|
|
24
27
|
* @property {string} outDir - Top-level output directory for combined llms.txt
|
|
25
28
|
* @property {SourceConfig[]} sources - Array of source configurations
|
|
26
29
|
*/
|
|
27
30
|
|
|
28
|
-
/**
|
|
29
|
-
* @typedef {Object} Manifest
|
|
30
|
-
* @property {string[]} files - List of generated files
|
|
31
|
-
* @property {string} timestamp - Timestamp of last generation
|
|
32
|
-
*/
|
|
33
|
-
|
|
34
31
|
const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
|
|
35
32
|
const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
|
|
36
33
|
|
|
@@ -218,6 +215,10 @@ async function loadConfig() {
|
|
|
218
215
|
JSON.stringify(
|
|
219
216
|
{
|
|
220
217
|
$schema: "https://extract.llmtext.com/llmtext.schema.json",
|
|
218
|
+
title: "Parallel Web Systems",
|
|
219
|
+
description: "Combined documentation from multiple sources",
|
|
220
|
+
details:
|
|
221
|
+
"This collection includes API documentation, guides, and references.",
|
|
221
222
|
outDir: "./docs",
|
|
222
223
|
sources: [
|
|
223
224
|
{
|
|
@@ -274,6 +275,7 @@ async function loadConfig() {
|
|
|
274
275
|
const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
|
|
275
276
|
|
|
276
277
|
// Validate required fields
|
|
278
|
+
if (!config.description) throw new Error("description is required");
|
|
277
279
|
if (!config.outDir) throw new Error("outDir is required");
|
|
278
280
|
if (!Array.isArray(config.sources))
|
|
279
281
|
throw new Error("sources must be an array");
|
|
@@ -416,59 +418,6 @@ async function getApiKey() {
|
|
|
416
418
|
return newApiKey;
|
|
417
419
|
}
|
|
418
420
|
|
|
419
|
-
/**
|
|
420
|
-
* Load manifest file
|
|
421
|
-
* @param {string} outDir - Output directory
|
|
422
|
-
* @returns {Manifest} The manifest object
|
|
423
|
-
*/
|
|
424
|
-
function loadManifest(outDir) {
|
|
425
|
-
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
426
|
-
|
|
427
|
-
if (!fs.existsSync(manifestPath)) {
|
|
428
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
429
|
-
}
|
|
430
|
-
|
|
431
|
-
try {
|
|
432
|
-
return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
|
|
433
|
-
} catch {
|
|
434
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
435
|
-
}
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
/**
|
|
439
|
-
* Save manifest file
|
|
440
|
-
* @param {string} outDir - Output directory
|
|
441
|
-
* @param {Manifest} manifest - The manifest to save
|
|
442
|
-
*/
|
|
443
|
-
function saveManifest(outDir, manifest) {
|
|
444
|
-
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
445
|
-
fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
|
|
446
|
-
}
|
|
447
|
-
|
|
448
|
-
/**
|
|
449
|
-
* Clean up old files that are no longer generated
|
|
450
|
-
* @param {string} outDir - Output directory
|
|
451
|
-
* @param {string[]} currentFiles - Currently generated files
|
|
452
|
-
* @param {string[]} previousFiles - Previously generated files
|
|
453
|
-
*/
|
|
454
|
-
function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
455
|
-
const filesToRemove = previousFiles.filter(
|
|
456
|
-
(file) => !currentFiles.includes(file)
|
|
457
|
-
);
|
|
458
|
-
|
|
459
|
-
for (const file of filesToRemove) {
|
|
460
|
-
const filePath = path.join(outDir, file);
|
|
461
|
-
try {
|
|
462
|
-
if (fs.existsSync(filePath)) {
|
|
463
|
-
fs.rmSync(filePath);
|
|
464
|
-
console.log(`🗑️ Removed old file: ${file}`);
|
|
465
|
-
}
|
|
466
|
-
} catch (error) {
|
|
467
|
-
console.warn(`⚠️ Could not remove ${file}:`, error.message);
|
|
468
|
-
}
|
|
469
|
-
}
|
|
470
|
-
}
|
|
471
|
-
|
|
472
421
|
/**
|
|
473
422
|
* Process custom URLs through extraction API
|
|
474
423
|
* @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
|
|
@@ -546,12 +495,18 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
|
|
|
546
495
|
|
|
547
496
|
/**
|
|
548
497
|
* Generate combined llms.txt from all sources
|
|
498
|
+
* @param {string} title - Top-level title
|
|
499
|
+
* @param {string} description - Top-level description
|
|
500
|
+
* @param {string} [details] - Optional top-level details
|
|
549
501
|
* @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
|
|
550
502
|
* @returns {string} Combined llms.txt content
|
|
551
503
|
*/
|
|
552
|
-
function generateCombinedLlmsTxt(allSources) {
|
|
553
|
-
let combinedTxt =
|
|
554
|
-
|
|
504
|
+
function generateCombinedLlmsTxt(title, description, details, allSources) {
|
|
505
|
+
let combinedTxt = `# Documentation Collection\n\n> ${description}\n\n`;
|
|
506
|
+
|
|
507
|
+
if (details) {
|
|
508
|
+
combinedTxt += `${details}\n\n`;
|
|
509
|
+
}
|
|
555
510
|
|
|
556
511
|
for (const source of allSources) {
|
|
557
512
|
combinedTxt += `## ${source.title}\n\n`;
|
|
@@ -641,11 +596,6 @@ async function main() {
|
|
|
641
596
|
fs.mkdirSync(sourceConfig.outDir, { recursive: true });
|
|
642
597
|
}
|
|
643
598
|
|
|
644
|
-
// Load previous manifest for this source (only if we have an outDir and not keeping original URLs)
|
|
645
|
-
const previousManifest = !sourceConfig.keepOriginalUrls
|
|
646
|
-
? loadManifest(sourceConfig.outDir)
|
|
647
|
-
: { files: [], timestamp: new Date().toISOString() };
|
|
648
|
-
const currentFiles = [];
|
|
649
599
|
let sourceFiles = {};
|
|
650
600
|
|
|
651
601
|
try {
|
|
@@ -702,7 +652,6 @@ async function main() {
|
|
|
702
652
|
|
|
703
653
|
fs.mkdirSync(fileDir, { recursive: true });
|
|
704
654
|
fs.writeFileSync(fullFilePath, file.content);
|
|
705
|
-
currentFiles.push(filename);
|
|
706
655
|
|
|
707
656
|
console.log(
|
|
708
657
|
`š Wrote: ${path.join(sourceConfig.outDir, filename)} (${
|
|
@@ -710,22 +659,6 @@ async function main() {
|
|
|
710
659
|
} tokens)`
|
|
711
660
|
);
|
|
712
661
|
}
|
|
713
|
-
|
|
714
|
-
// Clean up old files for this source
|
|
715
|
-
if (previousManifest.files.length > 0) {
|
|
716
|
-
cleanupOldFiles(
|
|
717
|
-
sourceConfig.outDir,
|
|
718
|
-
currentFiles,
|
|
719
|
-
previousManifest.files
|
|
720
|
-
);
|
|
721
|
-
}
|
|
722
|
-
|
|
723
|
-
// Save manifest for this source
|
|
724
|
-
const newManifest = {
|
|
725
|
-
files: currentFiles,
|
|
726
|
-
timestamp: new Date().toISOString(),
|
|
727
|
-
};
|
|
728
|
-
saveManifest(sourceConfig.outDir, newManifest);
|
|
729
662
|
} else {
|
|
730
663
|
console.log(
|
|
731
664
|
`š Keeping original URLs - not saving files locally for ${sourceName}`
|
|
@@ -752,7 +685,12 @@ async function main() {
|
|
|
752
685
|
|
|
753
686
|
// Generate and write combined llms.txt to top-level outDir
|
|
754
687
|
if (allSources.length > 0) {
|
|
755
|
-
const combinedLlmsTxt = generateCombinedLlmsTxt(allSources);
|
|
688
|
+
const combinedLlmsTxt = generateCombinedLlmsTxt(
|
|
689
|
+
config.title,
|
|
690
|
+
config.description,
|
|
691
|
+
config.details,
|
|
692
|
+
allSources
|
|
693
|
+
);
|
|
756
694
|
const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
|
|
757
695
|
fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
|
|
758
696
|
console.log(`\nš Generated combined llms.txt: ${combinedLlmsTxtPath}`);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.13",
|
|
4
|
+
"version": "0.0.15",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|