extract-from-sitemap 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +42 -19
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -8,13 +8,19 @@ const http = require("http");
|
|
|
8
8
|
const { URL, URLSearchParams } = require("url");
|
|
9
9
|
const os = require("os");
|
|
10
10
|
const { extractFromSitemap } = require("./mod.js");
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* @typedef {Object} OriginConfig
|
|
14
|
+
* @property {string} origin - The origin URL to process
|
|
15
|
+
* @property {boolean} forceExtract - Whether to force extraction for this origin
|
|
16
|
+
*/
|
|
17
|
+
|
|
11
18
|
/**
|
|
12
19
|
* @typedef {Object} Config
|
|
13
20
|
* @property {string} outDir - Output directory for extracted files
|
|
14
|
-
* @property {
|
|
21
|
+
* @property {OriginConfig[]} origins - Array of origin configurations
|
|
15
22
|
* @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
|
|
16
23
|
* @property {boolean} keepOriginalUrls - Whether to keep original URL structure
|
|
17
|
-
* @property {boolean} forceExtract - Whether to force extraction even if files exist
|
|
18
24
|
*/
|
|
19
25
|
|
|
20
26
|
/**
|
|
@@ -209,11 +215,14 @@ async function loadConfig() {
|
|
|
209
215
|
console.log(
|
|
210
216
|
JSON.stringify(
|
|
211
217
|
{
|
|
218
|
+
$schema: "https://extract.llmtext.com/llmtext.schema.json",
|
|
212
219
|
outDir: "./docs",
|
|
213
|
-
origins: [
|
|
220
|
+
origins: [
|
|
221
|
+
{ origin: "https://docs.parallel.ai", forceExtract: false },
|
|
222
|
+
{ origin: "https://parallel.ai", forceExtract: true },
|
|
223
|
+
],
|
|
214
224
|
customUrls: [],
|
|
215
225
|
keepOriginalUrls: false,
|
|
216
|
-
forceExtract: false,
|
|
217
226
|
},
|
|
218
227
|
null,
|
|
219
228
|
2
|
|
@@ -230,10 +239,22 @@ async function loadConfig() {
|
|
|
230
239
|
if (!Array.isArray(config.origins))
|
|
231
240
|
throw new Error("origins must be an array");
|
|
232
241
|
|
|
242
|
+
// Validate origin objects
|
|
243
|
+
for (const [index, originConfig] of config.origins.entries()) {
|
|
244
|
+
if (typeof originConfig !== "object" || originConfig === null) {
|
|
245
|
+
throw new Error(`origins[${index}] must be an object`);
|
|
246
|
+
}
|
|
247
|
+
if (!originConfig.origin) {
|
|
248
|
+
throw new Error(`origins[${index}].origin is required`);
|
|
249
|
+
}
|
|
250
|
+
if (typeof originConfig.forceExtract !== "boolean") {
|
|
251
|
+
throw new Error(`origins[${index}].forceExtract must be a boolean`);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
233
255
|
// Set defaults
|
|
234
256
|
config.customUrls = config.customUrls || [];
|
|
235
257
|
config.keepOriginalUrls = config.keepOriginalUrls ?? false;
|
|
236
|
-
config.forceExtract = config.forceExtract ?? false;
|
|
237
258
|
|
|
238
259
|
return config;
|
|
239
260
|
} catch (error) {
|
|
@@ -370,10 +391,9 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
|
370
391
|
* Process custom URLs through extraction API
|
|
371
392
|
* @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
|
|
372
393
|
* @param {string} apiKey - API key for authentication
|
|
373
|
-
* @param {boolean} forceExtract - Whether to force extraction
|
|
374
394
|
* @returns {Promise<Record<string, any>>} Extracted files
|
|
375
395
|
*/
|
|
376
|
-
async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
396
|
+
async function processCustomUrls(customUrls, apiKey) {
|
|
377
397
|
const files = {};
|
|
378
398
|
|
|
379
399
|
for (const customUrl of customUrls) {
|
|
@@ -468,14 +488,16 @@ async function main() {
|
|
|
468
488
|
let totalPages = 0;
|
|
469
489
|
let totalErrors = 0;
|
|
470
490
|
|
|
471
|
-
// Process each origin
|
|
472
|
-
for (const origin of config.origins) {
|
|
473
|
-
console.log(
|
|
491
|
+
// Process each origin with its own forceExtract setting
|
|
492
|
+
for (const originConfig of config.origins) {
|
|
493
|
+
console.log(
|
|
494
|
+
`\nš Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
|
|
495
|
+
);
|
|
474
496
|
|
|
475
497
|
try {
|
|
476
498
|
const result = await extractFromSitemap(
|
|
477
|
-
origin,
|
|
478
|
-
|
|
499
|
+
originConfig.origin,
|
|
500
|
+
originConfig.forceExtract,
|
|
479
501
|
apiKey
|
|
480
502
|
);
|
|
481
503
|
|
|
@@ -493,7 +515,9 @@ async function main() {
|
|
|
493
515
|
if (!config.keepOriginalUrls) {
|
|
494
516
|
// Create domain-specific subdirectory
|
|
495
517
|
const domain = new URL(
|
|
496
|
-
origin.startsWith("http")
|
|
518
|
+
originConfig.origin.startsWith("http")
|
|
519
|
+
? originConfig.origin
|
|
520
|
+
: `https://${originConfig.origin}`
|
|
497
521
|
).hostname;
|
|
498
522
|
const domainDir = path.join(config.outDir, domain);
|
|
499
523
|
fs.mkdirSync(domainDir, { recursive: true });
|
|
@@ -519,7 +543,10 @@ async function main() {
|
|
|
519
543
|
totalPages += result.totalPages;
|
|
520
544
|
totalErrors += result.errors;
|
|
521
545
|
} catch (error) {
|
|
522
|
-
console.error(
|
|
546
|
+
console.error(
|
|
547
|
+
`ā Error processing ${originConfig.origin}:`,
|
|
548
|
+
error.message
|
|
549
|
+
);
|
|
523
550
|
totalErrors++;
|
|
524
551
|
}
|
|
525
552
|
}
|
|
@@ -527,11 +554,7 @@ async function main() {
|
|
|
527
554
|
// Process custom URLs
|
|
528
555
|
if (config.customUrls.length > 0) {
|
|
529
556
|
console.log(`\nš Processing ${config.customUrls.length} custom URLs...`);
|
|
530
|
-
const customFiles = await processCustomUrls(
|
|
531
|
-
config.customUrls,
|
|
532
|
-
apiKey,
|
|
533
|
-
config.forceExtract
|
|
534
|
-
);
|
|
557
|
+
const customFiles = await processCustomUrls(config.customUrls, apiKey);
|
|
535
558
|
|
|
536
559
|
for (const [filename, file] of Object.entries(customFiles)) {
|
|
537
560
|
const filePath = path.join(config.outDir, filename);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.4",
|
|
4
|
+
"version": "0.0.5",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|