extract-from-sitemap 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +3 -22
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -7,7 +7,7 @@ const crypto = require("crypto");
|
|
|
7
7
|
const http = require("http");
|
|
8
8
|
const { URL, URLSearchParams } = require("url");
|
|
9
9
|
const os = require("os");
|
|
10
|
-
|
|
10
|
+
const { extractFromSitemap } = require("./mod.js");
|
|
11
11
|
/**
|
|
12
12
|
* @typedef {Object} Config
|
|
13
13
|
* @property {string} outDir - Output directory for extracted files
|
|
@@ -410,6 +410,8 @@ async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
|
410
410
|
tokens: Math.round((extracted.full_content || "").length / 5),
|
|
411
411
|
};
|
|
412
412
|
}
|
|
413
|
+
} else {
|
|
414
|
+
throw new Error(`${response.status} - ${await response.statusText()}`);
|
|
413
415
|
}
|
|
414
416
|
} catch (error) {
|
|
415
417
|
console.error(
|
|
@@ -438,27 +440,6 @@ async function clearCredentials() {
|
|
|
438
440
|
}
|
|
439
441
|
}
|
|
440
442
|
|
|
441
|
-
/**
|
|
442
|
-
* Extract content from sitemap (placeholder - you'll need to implement this)
|
|
443
|
-
* @param {string} origin - The origin URL
|
|
444
|
-
* @param {boolean} forceExtract - Whether to force extraction
|
|
445
|
-
* @param {string} apiKey - API key for authentication
|
|
446
|
-
* @returns {Promise<{totalPages: number, totalTokens: number, errors: number, files: Record<string, any>}>}
|
|
447
|
-
*/
|
|
448
|
-
async function extractFromSitemap(origin, forceExtract, apiKey) {
|
|
449
|
-
// This is a placeholder - you'll need to implement the actual extraction logic
|
|
450
|
-
// or import it from your mod.js file
|
|
451
|
-
console.log(`Extracting from ${origin} (force: ${forceExtract})`);
|
|
452
|
-
|
|
453
|
-
// For now, return empty result
|
|
454
|
-
return {
|
|
455
|
-
totalPages: 0,
|
|
456
|
-
totalTokens: 0,
|
|
457
|
-
errors: 0,
|
|
458
|
-
files: {},
|
|
459
|
-
};
|
|
460
|
-
}
|
|
461
|
-
|
|
462
443
|
/**
|
|
463
444
|
* Main function
|
|
464
445
|
*/
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.3",
|
|
4
|
+
"version": "0.0.4",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|