extract-from-sitemap 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/cli.js +44 -40
  2. package/package.json +1 -1
package/cli.js CHANGED
@@ -7,14 +7,20 @@ const crypto = require("crypto");
7
7
  const http = require("http");
8
8
  const { URL, URLSearchParams } = require("url");
9
9
  const os = require("os");
10
+ const { extractFromSitemap } = require("./mod.js");
11
+
12
+ /**
13
+ * @typedef {Object} OriginConfig
14
+ * @property {string} origin - The origin URL to process
15
+ * @property {boolean} forceExtract - Whether to force extraction for this origin
16
+ */
10
17
 
11
18
  /**
12
19
  * @typedef {Object} Config
13
20
  * @property {string} outDir - Output directory for extracted files
14
- * @property {string[]} origins - Array of origin URLs to process
21
+ * @property {OriginConfig[]} origins - Array of origin configurations
15
22
  * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
16
23
  * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
17
- * @property {boolean} forceExtract - Whether to force extraction even if files exist
18
24
  */
19
25
 
20
26
  /**
@@ -209,11 +215,14 @@ async function loadConfig() {
209
215
  console.log(
210
216
  JSON.stringify(
211
217
  {
218
+ $schema: "https://extract.llmtext.com/llmtext.schema.json",
212
219
  outDir: "./docs",
213
- origins: ["https://docs.example.com"],
220
+ origins: [
221
+ { origin: "https://docs.parallel.ai", forceExtract: false },
222
+ { origin: "https://parallel.ai", forceExtract: true },
223
+ ],
214
224
  customUrls: [],
215
225
  keepOriginalUrls: false,
216
- forceExtract: false,
217
226
  },
218
227
  null,
219
228
  2
@@ -230,10 +239,22 @@ async function loadConfig() {
230
239
  if (!Array.isArray(config.origins))
231
240
  throw new Error("origins must be an array");
232
241
 
242
+ // Validate origin objects
243
+ for (const [index, originConfig] of config.origins.entries()) {
244
+ if (typeof originConfig !== "object" || originConfig === null) {
245
+ throw new Error(`origins[${index}] must be an object`);
246
+ }
247
+ if (!originConfig.origin) {
248
+ throw new Error(`origins[${index}].origin is required`);
249
+ }
250
+ if (typeof originConfig.forceExtract !== "boolean") {
251
+ throw new Error(`origins[${index}].forceExtract must be a boolean`);
252
+ }
253
+ }
254
+
233
255
  // Set defaults
234
256
  config.customUrls = config.customUrls || [];
235
257
  config.keepOriginalUrls = config.keepOriginalUrls ?? false;
236
- config.forceExtract = config.forceExtract ?? false;
237
258
 
238
259
  return config;
239
260
  } catch (error) {
@@ -370,10 +391,9 @@ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
370
391
  * Process custom URLs through extraction API
371
392
  * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
372
393
  * @param {string} apiKey - API key for authentication
373
- * @param {boolean} forceExtract - Whether to force extraction
374
394
  * @returns {Promise<Record<string, any>>} Extracted files
375
395
  */
376
- async function processCustomUrls(customUrls, apiKey, forceExtract) {
396
+ async function processCustomUrls(customUrls, apiKey) {
377
397
  const files = {};
378
398
 
379
399
  for (const customUrl of customUrls) {
@@ -410,6 +430,8 @@ async function processCustomUrls(customUrls, apiKey, forceExtract) {
410
430
  tokens: Math.round((extracted.full_content || "").length / 5),
411
431
  };
412
432
  }
433
+ } else {
434
+ throw new Error(`${response.status} - ${await response.statusText()}`);
413
435
  }
414
436
  } catch (error) {
415
437
  console.error(
@@ -438,27 +460,6 @@ async function clearCredentials() {
438
460
  }
439
461
  }
440
462
 
441
- /**
442
- * Extract content from sitemap (placeholder - you'll need to implement this)
443
- * @param {string} origin - The origin URL
444
- * @param {boolean} forceExtract - Whether to force extraction
445
- * @param {string} apiKey - API key for authentication
446
- * @returns {Promise<{totalPages: number, totalTokens: number, errors: number, files: Record<string, any>}>}
447
- */
448
- async function extractFromSitemap(origin, forceExtract, apiKey) {
449
- // This is a placeholder - you'll need to implement the actual extraction logic
450
- // or import it from your mod.js file
451
- console.log(`Extracting from ${origin} (force: ${forceExtract})`);
452
-
453
- // For now, return empty result
454
- return {
455
- totalPages: 0,
456
- totalTokens: 0,
457
- errors: 0,
458
- files: {},
459
- };
460
- }
461
-
462
463
  /**
463
464
  * Main function
464
465
  */
@@ -487,14 +488,16 @@ async function main() {
487
488
  let totalPages = 0;
488
489
  let totalErrors = 0;
489
490
 
490
- // Process each origin
491
- for (const origin of config.origins) {
492
- console.log(`\n🌐 Processing origin: ${origin}`);
491
+ // Process each origin with its own forceExtract setting
492
+ for (const originConfig of config.origins) {
493
+ console.log(
494
+ `\n🌐 Processing origin: ${originConfig.origin} (forceExtract: ${originConfig.forceExtract})`
495
+ );
493
496
 
494
497
  try {
495
498
  const result = await extractFromSitemap(
496
- origin,
497
- config.forceExtract,
499
+ originConfig.origin,
500
+ originConfig.forceExtract,
498
501
  apiKey
499
502
  );
500
503
 
@@ -512,7 +515,9 @@ async function main() {
512
515
  if (!config.keepOriginalUrls) {
513
516
  // Create domain-specific subdirectory
514
517
  const domain = new URL(
515
- origin.startsWith("http") ? origin : `https://${origin}`
518
+ originConfig.origin.startsWith("http")
519
+ ? originConfig.origin
520
+ : `https://${originConfig.origin}`
516
521
  ).hostname;
517
522
  const domainDir = path.join(config.outDir, domain);
518
523
  fs.mkdirSync(domainDir, { recursive: true });
@@ -538,7 +543,10 @@ async function main() {
538
543
  totalPages += result.totalPages;
539
544
  totalErrors += result.errors;
540
545
  } catch (error) {
541
- console.error(`āŒ Error processing ${origin}:`, error.message);
546
+ console.error(
547
+ `āŒ Error processing ${originConfig.origin}:`,
548
+ error.message
549
+ );
542
550
  totalErrors++;
543
551
  }
544
552
  }
@@ -546,11 +554,7 @@ async function main() {
546
554
  // Process custom URLs
547
555
  if (config.customUrls.length > 0) {
548
556
  console.log(`\nšŸ“‹ Processing ${config.customUrls.length} custom URLs...`);
549
- const customFiles = await processCustomUrls(
550
- config.customUrls,
551
- apiKey,
552
- config.forceExtract
553
- );
557
+ const customFiles = await processCustomUrls(config.customUrls, apiKey);
554
558
 
555
559
  for (const [filename, file] of Object.entries(customFiles)) {
556
560
  const filePath = path.join(config.outDir, filename);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.3",
4
+ "version": "0.0.5",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [