extract-from-sitemap 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/cli.js +42 -19
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -8,13 +8,19 @@ const http = require("http");
8
8
  const { URL, URLSearchParams } = require("url");
9
9
  const os = require("os");
10
10
  const { extractFromSitemap } = require("./mod.js");
11
+
12
+ /**
13
+ * @typedef {Object} OriginConfig
14
+ * @property {string} origin - The origin URL to process
15
+ * @property {boolean} forceExtract - Whether to force extraction for this origin
16
+ */
17
+
11
18
  /**
12
19
  * @typedef {Object} Config
13
20
  * @property {string} outDir - Output directory for extracted files
14
- * @property {string[]} origins - Array of origin URLs to process
21
+ * @property {OriginConfig[]} origins - Array of origin configurations
15
22
  * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
16
23
  * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
17
- * @property {boolean} forceExtract - Whether to force extraction even if files exist
18
24
  */
19
25
 
20
26
  /**
@@ -209,11 +215,14 @@ async function loadConfig() {
209
215
  console.log(
210
216
  JSON.stringify(
211
217
  {
218
+ $schema: "https://extract.llmtext.com/llmtext.schema.json",
212
219
  outDir: "./docs",
213
- origins: ["https://docs.example.com"],
220
+ origins: [
221
+ { origin: "https://docs.parallel.ai", forceExtract: false },
222
+ { origin: "https://parallel.ai", forceExtract: true },
223
+ ],
214
224
  customUrls: [],
215
225
  keepOriginalUrls: false,
216
- forceExtract: false,
217
226
  },
218
227
  null,
219
228
  2
@@ -230,10 +239,22 @@ async function loadConfig() {
230
239
  if (!Array.isArray(config.origins))
231
240
  throw new Error("origins must be an array");
232
241
 
242
+ // Validate origin objects
243
+ for (const [index, originConfig] of config.origins.entries()) {
244
+ if (typeof originConfig !== "object" || originConfig === null) {
245
+ throw new Error(`origins[${index}] must be an object`);
246
+ }
247
+ if (!originConfig.origin) {
248
+ throw new Error(`origins[${index}].origin is required`);
249
+ }
250
+ if (typeof originConfig.forceExtract !== "boolean") {
251
+ throw new Error(`origins[${index}].forceExtract must be a boolean`);
252
+ }
253
+ }
254
+
233
255
  // Set defaults
234
256
  config.customUrls = config.customUrls || [];
235
257
  config.keepOriginalUrls = config.keepOriginalUrls ?? false;
236
- config.forceExtract = config.forceExtract ?? false;
237
258
 
238
259
  return config;
239
260
  } catch (error) {
@@ -370,10 +391,9 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
370
391
  * Process custom URLs through extraction API
371
392
  * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
372
393
  * @param {string} apiKey - API key for authentication
373
- * @param {boolean} forceExtract - Whether to force extraction
374
394
  * @returns {Promise<Record<string, any>>} Extracted files
375
395
  */
376
- async function processCustomUrls(customUrls, apiKey, forceExtract) {
396
+ async function processCustomUrls(customUrls, apiKey) {
377
397
  const files = {};
378
398
 
379
399
  for (const customUrl of customUrls) {
@@ -468,14 +488,16 @@ async function main() {
468
488
  let totalPages = 0;
469
489
  let totalErrors = 0;
470
490
 
471
- // Process each origin
472
- for (const origin of config.origins) {
473
- console.log(`\n🌐 Processing origin: ${origin}`);
491
+ // Process each origin with its own forceExtract setting
492
+ for (const originConfig of config.origins) {
493
+ console.log(
494
+ `\n🌐 Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
495
+ );
474
496
 
475
497
  try {
476
498
  const result = await extractFromSitemap(
477
- origin,
478
- config.forceExtract,
499
+ originConfig.origin,
500
+ originConfig.forceExtract,
479
501
  apiKey
480
502
  );
481
503
 
@@ -493,7 +515,9 @@ async function main() {
493
515
  if (!config.keepOriginalUrls) {
494
516
  // Create domain-specific subdirectory
495
517
  const domain = new URL(
496
- origin.startsWith("http") ? origin : `https://${origin}`
518
+ originConfig.origin.startsWith("http")
519
+ ? originConfig.origin
520
+ : `https://${originConfig.origin}`
497
521
  ).hostname;
498
522
  const domainDir = path.join(config.outDir, domain);
499
523
  fs.mkdirSync(domainDir, { recursive: true });
@@ -519,7 +543,10 @@ async function main() {
519
543
  totalPages += result.totalPages;
520
544
  totalErrors += result.errors;
521
545
  } catch (error) {
522
- console.error(`āŒ Error processing ${origin}:`, error.message);
546
+ console.error(
547
+ `āŒ Error processing ${originConfig.origin}:`,
548
+ error.message
549
+ );
523
550
  totalErrors++;
524
551
  }
525
552
  }
@@ -527,11 +554,7 @@ async function main() {
527
554
  // Process custom URLs
528
555
  if (config.customUrls.length > 0) {
529
556
  console.log(`\n📋 Processing ${config.customUrls.length} custom URLs...`);
530
- const customFiles = await processCustomUrls(
531
- config.customUrls,
532
- apiKey,
533
- config.forceExtract
534
- );
557
+ const customFiles = await processCustomUrls(config.customUrls, apiKey);
535
558
 
536
559
  for (const [filename, file] of Object.entries(customFiles)) {
537
560
  const filePath = path.join(config.outDir, filename);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.4",
4
+ "version": "0.0.5",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [