extract-from-sitemap 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +44 -40
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -7,14 +7,20 @@ const crypto = require("crypto");
|
|
|
7
7
|
const http = require("http");
|
|
8
8
|
const { URL, URLSearchParams } = require("url");
|
|
9
9
|
const os = require("os");
|
|
10
|
+
const { extractFromSitemap } = require("./mod.js");
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* @typedef {Object} OriginConfig
|
|
14
|
+
* @property {string} origin - The origin URL to process
|
|
15
|
+
* @property {boolean} forceExtract - Whether to force extraction for this origin
|
|
16
|
+
*/
|
|
10
17
|
|
|
11
18
|
/**
|
|
12
19
|
* @typedef {Object} Config
|
|
13
20
|
* @property {string} outDir - Output directory for extracted files
|
|
14
|
-
* @property {
|
|
21
|
+
* @property {OriginConfig[]} origins - Array of origin configurations
|
|
15
22
|
* @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
|
|
16
23
|
* @property {boolean} keepOriginalUrls - Whether to keep original URL structure
|
|
17
|
-
* @property {boolean} forceExtract - Whether to force extraction even if files exist
|
|
18
24
|
*/
|
|
19
25
|
|
|
20
26
|
/**
|
|
@@ -209,11 +215,14 @@ async function loadConfig() {
|
|
|
209
215
|
console.log(
|
|
210
216
|
JSON.stringify(
|
|
211
217
|
{
|
|
218
|
+
$schema: "https://extract.llmtext.com/llmtext.schema.json",
|
|
212
219
|
outDir: "./docs",
|
|
213
|
-
origins: [
|
|
220
|
+
origins: [
|
|
221
|
+
{ origin: "https://docs.parallel.ai", forceExtract: false },
|
|
222
|
+
{ origin: "https://parallel.ai", forceExtract: true },
|
|
223
|
+
],
|
|
214
224
|
customUrls: [],
|
|
215
225
|
keepOriginalUrls: false,
|
|
216
|
-
forceExtract: false,
|
|
217
226
|
},
|
|
218
227
|
null,
|
|
219
228
|
2
|
|
@@ -230,10 +239,22 @@ async function loadConfig() {
|
|
|
230
239
|
if (!Array.isArray(config.origins))
|
|
231
240
|
throw new Error("origins must be an array");
|
|
232
241
|
|
|
242
|
+
// Validate origin objects
|
|
243
|
+
for (const [index, originConfig] of config.origins.entries()) {
|
|
244
|
+
if (typeof originConfig !== "object" || originConfig === null) {
|
|
245
|
+
throw new Error(`origins[${index}] must be an object`);
|
|
246
|
+
}
|
|
247
|
+
if (!originConfig.origin) {
|
|
248
|
+
throw new Error(`origins[${index}].origin is required`);
|
|
249
|
+
}
|
|
250
|
+
if (typeof originConfig.forceExtract !== "boolean") {
|
|
251
|
+
throw new Error(`origins[${index}].forceExtract must be a boolean`);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
233
255
|
// Set defaults
|
|
234
256
|
config.customUrls = config.customUrls || [];
|
|
235
257
|
config.keepOriginalUrls = config.keepOriginalUrls ?? false;
|
|
236
|
-
config.forceExtract = config.forceExtract ?? false;
|
|
237
258
|
|
|
238
259
|
return config;
|
|
239
260
|
} catch (error) {
|
|
@@ -370,10 +391,9 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
|
370
391
|
* Process custom URLs through extraction API
|
|
371
392
|
* @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
|
|
372
393
|
* @param {string} apiKey - API key for authentication
|
|
373
|
-
* @param {boolean} forceExtract - Whether to force extraction
|
|
374
394
|
* @returns {Promise<Record<string, any>>} Extracted files
|
|
375
395
|
*/
|
|
376
|
-
async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
396
|
+
async function processCustomUrls(customUrls, apiKey) {
|
|
377
397
|
const files = {};
|
|
378
398
|
|
|
379
399
|
for (const customUrl of customUrls) {
|
|
@@ -410,6 +430,8 @@ async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
|
410
430
|
tokens: Math.round((extracted.full_content || "").length / 5),
|
|
411
431
|
};
|
|
412
432
|
}
|
|
433
|
+
} else {
|
|
434
|
+
throw new Error(`${response.status} - ${await response.statusText()}`);
|
|
413
435
|
}
|
|
414
436
|
} catch (error) {
|
|
415
437
|
console.error(
|
|
@@ -438,27 +460,6 @@ async function clearCredentials() {
|
|
|
438
460
|
}
|
|
439
461
|
}
|
|
440
462
|
|
|
441
|
-
/**
|
|
442
|
-
* Extract content from sitemap (placeholder - you'll need to implement this)
|
|
443
|
-
* @param {string} origin - The origin URL
|
|
444
|
-
* @param {boolean} forceExtract - Whether to force extraction
|
|
445
|
-
* @param {string} apiKey - API key for authentication
|
|
446
|
-
* @returns {Promise<{totalPages: number, totalTokens: number, errors: number, files: Record<string, any>}>}
|
|
447
|
-
*/
|
|
448
|
-
async function extractFromSitemap(origin, forceExtract, apiKey) {
|
|
449
|
-
// This is a placeholder - you'll need to implement the actual extraction logic
|
|
450
|
-
// or import it from your mod.js file
|
|
451
|
-
console.log(`Extracting from ${origin} (force: ${forceExtract})`);
|
|
452
|
-
|
|
453
|
-
// For now, return empty result
|
|
454
|
-
return {
|
|
455
|
-
totalPages: 0,
|
|
456
|
-
totalTokens: 0,
|
|
457
|
-
errors: 0,
|
|
458
|
-
files: {},
|
|
459
|
-
};
|
|
460
|
-
}
|
|
461
|
-
|
|
462
463
|
/**
|
|
463
464
|
* Main function
|
|
464
465
|
*/
|
|
@@ -487,14 +488,16 @@ async function main() {
|
|
|
487
488
|
let totalPages = 0;
|
|
488
489
|
let totalErrors = 0;
|
|
489
490
|
|
|
490
|
-
// Process each origin
|
|
491
|
-
for (const origin of config.origins) {
|
|
492
|
-
console.log(
|
|
491
|
+
// Process each origin with its own forceExtract setting
|
|
492
|
+
for (const originConfig of config.origins) {
|
|
493
|
+
console.log(
|
|
494
|
+
`\nš Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
|
|
495
|
+
);
|
|
493
496
|
|
|
494
497
|
try {
|
|
495
498
|
const result = await extractFromSitemap(
|
|
496
|
-
origin,
|
|
497
|
-
|
|
499
|
+
originConfig.origin,
|
|
500
|
+
originConfig.forceExtract,
|
|
498
501
|
apiKey
|
|
499
502
|
);
|
|
500
503
|
|
|
@@ -512,7 +515,9 @@ async function main() {
|
|
|
512
515
|
if (!config.keepOriginalUrls) {
|
|
513
516
|
// Create domain-specific subdirectory
|
|
514
517
|
const domain = new URL(
|
|
515
|
-
origin.startsWith("http")
|
|
518
|
+
originConfig.origin.startsWith("http")
|
|
519
|
+
? originConfig.origin
|
|
520
|
+
: `https://${originConfig.origin}`
|
|
516
521
|
).hostname;
|
|
517
522
|
const domainDir = path.join(config.outDir, domain);
|
|
518
523
|
fs.mkdirSync(domainDir, { recursive: true });
|
|
@@ -538,7 +543,10 @@ async function main() {
|
|
|
538
543
|
totalPages += result.totalPages;
|
|
539
544
|
totalErrors += result.errors;
|
|
540
545
|
} catch (error) {
|
|
541
|
-
console.error(
|
|
546
|
+
console.error(
|
|
547
|
+
`❌ Error processing ${originConfig.origin}:`,
|
|
548
|
+
error.message
|
|
549
|
+
);
|
|
542
550
|
totalErrors++;
|
|
543
551
|
}
|
|
544
552
|
}
|
|
@@ -546,11 +554,7 @@ async function main() {
|
|
|
546
554
|
// Process custom URLs
|
|
547
555
|
if (config.customUrls.length > 0) {
|
|
548
556
|
console.log(`\nš Processing ${config.customUrls.length} custom URLs...`);
|
|
549
|
-
const customFiles = await processCustomUrls(
|
|
550
|
-
config.customUrls,
|
|
551
|
-
apiKey,
|
|
552
|
-
config.forceExtract
|
|
553
|
-
);
|
|
557
|
+
const customFiles = await processCustomUrls(config.customUrls, apiKey);
|
|
554
558
|
|
|
555
559
|
for (const [filename, file] of Object.entries(customFiles)) {
|
|
556
560
|
const filePath = path.join(config.outDir, filename);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.
|
|
4
|
+
"version": "0.0.5",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|