extract-from-sitemap 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{cli.ts → cli.js} +225 -196
- package/package.json +4 -5
package/{cli.ts → cli.js}
RENAMED
|
@@ -1,52 +1,47 @@
|
|
|
1
|
-
#!/usr/bin/env
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
const SECRETS_SERVICE = "extract-from-sitemap-cli";
|
|
35
|
-
const SECRETS_KEY = "parallel-api-key";
|
|
36
|
-
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const fs = require("fs");
|
|
4
|
+
const path = require("path");
|
|
5
|
+
const { spawn } = require("child_process");
|
|
6
|
+
const crypto = require("crypto");
|
|
7
|
+
const http = require("http");
|
|
8
|
+
const { URL, URLSearchParams } = require("url");
|
|
9
|
+
const os = require("os");
|
|
10
|
+
const { extractFromSitemap } = require("./mod.js");
|
|
11
|
+
/**
|
|
12
|
+
* @typedef {Object} Config
|
|
13
|
+
* @property {string} outDir - Output directory for extracted files
|
|
14
|
+
* @property {string[]} origins - Array of origin URLs to process
|
|
15
|
+
* @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
|
|
16
|
+
* @property {boolean} keepOriginalUrls - Whether to keep original URL structure
|
|
17
|
+
* @property {boolean} forceExtract - Whether to force extraction even if files exist
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* @typedef {Object} Manifest
|
|
22
|
+
* @property {string[]} files - List of generated files
|
|
23
|
+
* @property {string} timestamp - Timestamp of last generation
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
|
|
27
|
+
const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* OAuth handler for Parallel.ai API key authentication
|
|
31
|
+
*/
|
|
37
32
|
class OAuth {
|
|
38
|
-
private clientId: string;
|
|
39
|
-
private redirectUri: string;
|
|
40
|
-
private scope: string;
|
|
41
|
-
private server?: Bun.Server;
|
|
42
|
-
|
|
43
33
|
constructor() {
|
|
44
34
|
this.clientId = "extract-from-sitemap-cli";
|
|
45
35
|
this.redirectUri = "http://localhost:3737/callback";
|
|
46
36
|
this.scope = "key:read";
|
|
37
|
+
this.server = null;
|
|
47
38
|
}
|
|
48
39
|
|
|
49
|
-
|
|
40
|
+
/**
|
|
41
|
+
* Get API key through OAuth flow
|
|
42
|
+
* @returns {Promise<string>} The API key
|
|
43
|
+
*/
|
|
44
|
+
async getApiKey() {
|
|
50
45
|
console.log("🔐 Starting OAuth flow...");
|
|
51
46
|
|
|
52
47
|
// Generate PKCE parameters
|
|
@@ -62,7 +57,7 @@ class OAuth {
|
|
|
62
57
|
authUrl.searchParams.set("code_challenge_method", "S256");
|
|
63
58
|
authUrl.searchParams.set("state", Math.random().toString(36));
|
|
64
59
|
|
|
65
|
-
console.log(
|
|
60
|
+
console.log("\n📖 Opening browser for authorization...");
|
|
66
61
|
|
|
67
62
|
// Open browser automatically
|
|
68
63
|
await this.openBrowser(authUrl.toString());
|
|
@@ -97,13 +92,14 @@ class OAuth {
|
|
|
97
92
|
return access_token;
|
|
98
93
|
}
|
|
99
94
|
|
|
100
|
-
|
|
95
|
+
/**
|
|
96
|
+
* Open browser to authorization URL
|
|
97
|
+
* @param {string} url - The authorization URL
|
|
98
|
+
*/
|
|
99
|
+
async openBrowser(url) {
|
|
101
100
|
try {
|
|
102
|
-
const { spawn } = require("child_process");
|
|
103
101
|
const platform = process.platform;
|
|
104
|
-
|
|
105
|
-
let command: string;
|
|
106
|
-
let args: string[];
|
|
102
|
+
let command, args;
|
|
107
103
|
|
|
108
104
|
if (platform === "darwin") {
|
|
109
105
|
command = "open";
|
|
@@ -119,90 +115,93 @@ class OAuth {
|
|
|
119
115
|
|
|
120
116
|
spawn(command, args, { detached: true, stdio: "ignore" });
|
|
121
117
|
} catch (error) {
|
|
122
|
-
console.log(
|
|
118
|
+
console.log("\n📖 Please visit this URL to authorize the application:");
|
|
123
119
|
console.log(`${url}\n`);
|
|
124
120
|
}
|
|
125
121
|
}
|
|
126
122
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
"
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
return { codeVerifier, codeChallenge };
|
|
123
|
+
/**
|
|
124
|
+
* Generate PKCE code verifier and challenge
|
|
125
|
+
* @returns {Promise<{codeVerifier: string, codeChallenge: string}>}
|
|
126
|
+
*/
|
|
127
|
+
async generatePKCE() {
|
|
128
|
+
const codeVerifier = crypto.randomBytes(32).toString("base64url");
|
|
129
|
+
const hash = crypto
|
|
130
|
+
.createHash("sha256")
|
|
131
|
+
.update(codeVerifier)
|
|
132
|
+
.digest("base64url");
|
|
133
|
+
|
|
134
|
+
return {
|
|
135
|
+
codeVerifier,
|
|
136
|
+
codeChallenge: hash,
|
|
137
|
+
};
|
|
144
138
|
}
|
|
145
139
|
|
|
146
|
-
|
|
140
|
+
/**
|
|
141
|
+
* Start HTTP server to catch OAuth callback
|
|
142
|
+
* @returns {Promise<string>} The authorization code
|
|
143
|
+
*/
|
|
144
|
+
async startCallbackServer() {
|
|
147
145
|
return new Promise((resolve, reject) => {
|
|
148
|
-
this.server =
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
"Error occurred. You can close this window.",
|
|
161
|
-
{ status: 400 }
|
|
162
|
-
);
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
if (code) {
|
|
166
|
-
resolve(code);
|
|
167
|
-
// Don't stop server here - let the cleanup happen in the finally block
|
|
168
|
-
return new Response(
|
|
169
|
-
"✅ Authorization successful! You can close this window and return to the terminal.",
|
|
170
|
-
{
|
|
171
|
-
headers: {
|
|
172
|
-
"Content-Type": "text/html",
|
|
173
|
-
},
|
|
174
|
-
}
|
|
175
|
-
);
|
|
176
|
-
}
|
|
146
|
+
this.server = http.createServer((req, res) => {
|
|
147
|
+
const url = new URL(req.url, `http://${req.headers.host}`);
|
|
148
|
+
|
|
149
|
+
if (url.pathname === "/callback") {
|
|
150
|
+
const code = url.searchParams.get("code");
|
|
151
|
+
const error = url.searchParams.get("error");
|
|
152
|
+
|
|
153
|
+
if (error) {
|
|
154
|
+
reject(new Error(`OAuth error: ${error}`));
|
|
155
|
+
res.writeHead(400);
|
|
156
|
+
res.end("Error occurred. You can close this window.");
|
|
157
|
+
return;
|
|
177
158
|
}
|
|
178
159
|
|
|
179
|
-
|
|
180
|
-
|
|
160
|
+
if (code) {
|
|
161
|
+
resolve(code);
|
|
162
|
+
res.writeHead(200, { "Content-Type": "text/html" });
|
|
163
|
+
res.end(
|
|
164
|
+
"✅ Authorization successful! You can close this window and return to the terminal."
|
|
165
|
+
);
|
|
166
|
+
return;
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
res.writeHead(404);
|
|
171
|
+
res.end("Invalid request");
|
|
181
172
|
});
|
|
182
173
|
|
|
174
|
+
this.server.listen(3737);
|
|
175
|
+
|
|
183
176
|
// Timeout after 5 minutes
|
|
184
177
|
setTimeout(() => {
|
|
185
178
|
this.stopServer();
|
|
186
179
|
reject(new Error("OAuth flow timed out"));
|
|
187
180
|
}, 300000);
|
|
188
181
|
}).finally(() => {
|
|
189
|
-
// Ensure server is stopped after promise resolves or rejects
|
|
190
182
|
this.stopServer();
|
|
191
183
|
});
|
|
192
184
|
}
|
|
193
185
|
|
|
194
|
-
|
|
186
|
+
/**
|
|
187
|
+
* Stop the callback server
|
|
188
|
+
*/
|
|
189
|
+
stopServer() {
|
|
195
190
|
if (this.server) {
|
|
196
|
-
this.server.
|
|
197
|
-
this.server =
|
|
191
|
+
this.server.close();
|
|
192
|
+
this.server = null;
|
|
198
193
|
}
|
|
199
194
|
}
|
|
200
195
|
}
|
|
201
196
|
|
|
202
|
-
|
|
203
|
-
|
|
197
|
+
/**
|
|
198
|
+
* Load configuration from llmtext.json
|
|
199
|
+
* @returns {Promise<Config>} The configuration object
|
|
200
|
+
*/
|
|
201
|
+
async function loadConfig() {
|
|
202
|
+
const configPath = path.resolve("llmtext.json");
|
|
204
203
|
|
|
205
|
-
if (!existsSync(configPath)) {
|
|
204
|
+
if (!fs.existsSync(configPath)) {
|
|
206
205
|
console.error(
|
|
207
206
|
"❌ llmtext.json not found. Please create a configuration file."
|
|
208
207
|
);
|
|
@@ -224,7 +223,7 @@ async function loadConfig(): Promise<Config> {
|
|
|
224
223
|
}
|
|
225
224
|
|
|
226
225
|
try {
|
|
227
|
-
const config = JSON.parse(readFileSync(configPath, "utf8"))
|
|
226
|
+
const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
|
|
228
227
|
|
|
229
228
|
// Validate required fields
|
|
230
229
|
if (!config.outDir) throw new Error("outDir is required");
|
|
@@ -243,28 +242,56 @@ async function loadConfig(): Promise<Config> {
|
|
|
243
242
|
}
|
|
244
243
|
}
|
|
245
244
|
|
|
246
|
-
|
|
247
|
-
|
|
245
|
+
/**
 * Store API key in ~/.llmtext/api-key
 *
 * Best effort: any filesystem failure is reported as a warning and not thrown.
 * @param {string} apiKey - The API key to store
 */
function storeApiKey(apiKey) {
  try {
    // Request an owner-only directory when it has to be created.
    fs.mkdirSync(CREDENTIALS_DIR, { recursive: true, mode: 0o700 });
    fs.writeFileSync(API_KEY_FILE, apiKey, { mode: 0o600 });
    // The `mode` option only applies when the file is newly created, so
    // tighten the permissions explicitly in case the file already existed.
    fs.chmodSync(API_KEY_FILE, 0o600);
    console.log("💾 API key stored securely in ~/.llmtext/api-key");
  } catch (error) {
    console.warn("⚠️ Could not store API key:", error.message);
  }
}
|
|
253
258
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
259
|
+
/**
 * Read the API key saved in ~/.llmtext/api-key.
 *
 * Read failures are reported as a warning and treated as "no key".
 * @returns {string|null} The trimmed key, or null when the file is missing,
 *   empty, or unreadable
 */
function loadStoredApiKey() {
  try {
    if (!fs.existsSync(API_KEY_FILE)) {
      return null;
    }

    const storedKey = fs.readFileSync(API_KEY_FILE, "utf8").trim();
    if (storedKey !== "") {
      console.log("🔑 Using stored API key from ~/.llmtext/api-key");
      return storedKey;
    }
  } catch (error) {
    console.warn("⚠️ Could not read stored API key:", error.message);
  }

  return null;
}
|
|
277
|
+
|
|
278
|
+
/**
|
|
279
|
+
* Get API key from various sources or start OAuth flow
|
|
280
|
+
* @returns {Promise<string>} The API key
|
|
281
|
+
*/
|
|
282
|
+
async function getApiKey() {
|
|
283
|
+
// Check stored API key first
|
|
284
|
+
const storedKey = loadStoredApiKey();
|
|
285
|
+
if (storedKey) {
|
|
286
|
+
return storedKey;
|
|
260
287
|
}
|
|
261
288
|
|
|
262
|
-
// Check environment variables
|
|
289
|
+
// Check environment variables
|
|
263
290
|
let apiKey = process.env.PARALLEL_API_KEY;
|
|
264
291
|
|
|
265
|
-
if (!apiKey && existsSync(".env")) {
|
|
292
|
+
if (!apiKey && fs.existsSync(".env")) {
|
|
266
293
|
// Try to load from .env file
|
|
267
|
-
const envContent = readFileSync(".env", "utf8");
|
|
294
|
+
const envContent = fs.readFileSync(".env", "utf8");
|
|
268
295
|
const match = envContent.match(/^PARALLEL_API_KEY=(.+)$/m);
|
|
269
296
|
if (match) {
|
|
270
297
|
apiKey = match[1].trim();
|
|
@@ -273,17 +300,7 @@ async function getApiKey(): Promise<string> {
|
|
|
273
300
|
|
|
274
301
|
if (apiKey) {
|
|
275
302
|
console.log("🔑 Using API key from environment");
|
|
276
|
-
|
|
277
|
-
try {
|
|
278
|
-
await secrets.set({
|
|
279
|
-
service: SECRETS_SERVICE,
|
|
280
|
-
name: SECRETS_KEY,
|
|
281
|
-
value: apiKey,
|
|
282
|
-
});
|
|
283
|
-
console.log("💾 API key stored in keychain for future use");
|
|
284
|
-
} catch (error) {
|
|
285
|
-
console.warn("⚠️ Could not store API key in keychain:", error.message);
|
|
286
|
-
}
|
|
303
|
+
storeApiKey(apiKey);
|
|
287
304
|
return apiKey;
|
|
288
305
|
}
|
|
289
306
|
|
|
@@ -292,57 +309,55 @@ async function getApiKey(): Promise<string> {
|
|
|
292
309
|
const oauth = new OAuth();
|
|
293
310
|
const newApiKey = await oauth.getApiKey();
|
|
294
311
|
|
|
295
|
-
|
|
296
|
-
try {
|
|
297
|
-
await secrets.set({
|
|
298
|
-
service: SECRETS_SERVICE,
|
|
299
|
-
name: SECRETS_KEY,
|
|
300
|
-
value: newApiKey,
|
|
301
|
-
});
|
|
302
|
-
console.log("💾 API key stored securely in keychain");
|
|
303
|
-
} catch (error) {
|
|
304
|
-
console.warn("⚠️ Could not store API key in keychain:", error.message);
|
|
305
|
-
console.log(
|
|
306
|
-
"💡 You may need to set PARALLEL_API_KEY environment variable for future runs"
|
|
307
|
-
);
|
|
308
|
-
}
|
|
309
|
-
|
|
312
|
+
storeApiKey(newApiKey);
|
|
310
313
|
return newApiKey;
|
|
311
314
|
}
|
|
312
315
|
|
|
313
|
-
|
|
314
|
-
|
|
316
|
+
/**
 * Load manifest file
 *
 * Reads `llmtext-manifest.json` from the output directory. A fresh, empty
 * manifest is returned when the file is missing, is not valid JSON, or does
 * not contain an object with a `files` array, so callers can always iterate
 * `files` safely. Non-string entries in `files` are dropped.
 * @param {string} outDir - Output directory
 * @returns {Manifest} The manifest object
 */
function loadManifest(outDir) {
  const emptyManifest = () => ({
    files: [],
    timestamp: new Date().toISOString(),
  });
  const manifestPath = path.join(outDir, "llmtext-manifest.json");

  if (!fs.existsSync(manifestPath)) {
    return emptyManifest();
  }

  try {
    const parsed = JSON.parse(fs.readFileSync(manifestPath, "utf8"));

    // A hand-edited or corrupted manifest can parse successfully and still
    // have the wrong shape (null, a string, `files` not being a list).
    if (
      parsed === null ||
      typeof parsed !== "object" ||
      !Array.isArray(parsed.files)
    ) {
      return emptyManifest();
    }

    return {
      ...parsed,
      files: parsed.files.filter((entry) => typeof entry === "string"),
    };
  } catch {
    return emptyManifest();
  }
}
|
|
326
334
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
335
|
+
/**
 * Write the manifest to `llmtext-manifest.json` inside the output directory,
 * serialized as JSON with two-space indentation.
 * @param {string} outDir - Output directory
 * @param {Manifest} manifest - The manifest to save
 */
function saveManifest(outDir, manifest) {
  const target = path.join(outDir, "llmtext-manifest.json");
  const serialized = JSON.stringify(manifest, null, 2);

  fs.writeFileSync(target, serialized);
}
|
|
331
344
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
345
|
+
/**
|
|
346
|
+
* Clean up old files that are no longer generated
|
|
347
|
+
* @param {string} outDir - Output directory
|
|
348
|
+
* @param {string[]} currentFiles - Currently generated files
|
|
349
|
+
* @param {string[]} previousFiles - Previously generated files
|
|
350
|
+
*/
|
|
351
|
+
function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
337
352
|
const filesToRemove = previousFiles.filter(
|
|
338
353
|
(file) => !currentFiles.includes(file)
|
|
339
354
|
);
|
|
340
355
|
|
|
341
356
|
for (const file of filesToRemove) {
|
|
342
|
-
const filePath = join(outDir, file);
|
|
357
|
+
const filePath = path.join(outDir, file);
|
|
343
358
|
try {
|
|
344
|
-
if (existsSync(filePath)) {
|
|
345
|
-
rmSync(filePath);
|
|
359
|
+
if (fs.existsSync(filePath)) {
|
|
360
|
+
fs.rmSync(filePath);
|
|
346
361
|
console.log(`🗑️ Removed old file: ${file}`);
|
|
347
362
|
}
|
|
348
363
|
} catch (error) {
|
|
@@ -351,18 +366,20 @@ function cleanupOldFiles(
|
|
|
351
366
|
}
|
|
352
367
|
}
|
|
353
368
|
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
369
|
+
/**
|
|
370
|
+
* Process custom URLs through extraction API
|
|
371
|
+
* @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
|
|
372
|
+
* @param {string} apiKey - API key for authentication
|
|
373
|
+
* @param {boolean} forceExtract - Whether to force extraction
|
|
374
|
+
* @returns {Promise<Record<string, any>>} Extracted files
|
|
375
|
+
*/
|
|
376
|
+
async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
377
|
+
const files = {};
|
|
360
378
|
|
|
361
379
|
for (const customUrl of customUrls) {
|
|
362
380
|
console.log(`📄 Processing custom URL: ${customUrl.url}`);
|
|
363
381
|
|
|
364
382
|
try {
|
|
365
|
-
// For custom URLs, we need to extract them individually
|
|
366
383
|
const response = await fetch("https://api.parallel.ai/v1beta/extract", {
|
|
367
384
|
method: "POST",
|
|
368
385
|
headers: {
|
|
@@ -393,6 +410,8 @@ async function processCustomUrls(
|
|
|
393
410
|
tokens: Math.round((extracted.full_content || "").length / 5),
|
|
394
411
|
};
|
|
395
412
|
}
|
|
413
|
+
} else {
|
|
414
|
+
// `statusText` is a string property of the fetch Response, not a method.
throw new Error(`${response.status} - ${response.statusText}`);
|
|
396
415
|
}
|
|
397
416
|
} catch (error) {
|
|
398
417
|
console.error(
|
|
@@ -405,16 +424,14 @@ async function processCustomUrls(
|
|
|
405
424
|
return files;
|
|
406
425
|
}
|
|
407
426
|
|
|
408
|
-
|
|
409
|
-
|
|
427
|
+
/**
|
|
428
|
+
* Clear stored API key credentials
|
|
429
|
+
*/
|
|
430
|
+
async function clearCredentials() {
|
|
410
431
|
try {
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
});
|
|
415
|
-
|
|
416
|
-
if (deleted) {
|
|
417
|
-
console.log("✅ Cleared stored API key from keychain");
|
|
432
|
+
if (fs.existsSync(API_KEY_FILE)) {
|
|
433
|
+
fs.unlinkSync(API_KEY_FILE);
|
|
434
|
+
console.log("✅ Cleared stored API key from ~/.llmtext/api-key");
|
|
418
435
|
} else {
|
|
419
436
|
console.log("ℹ️ No stored API key found to clear");
|
|
420
437
|
}
|
|
@@ -423,6 +440,9 @@ async function clearCredentials(): Promise<void> {
|
|
|
423
440
|
}
|
|
424
441
|
}
|
|
425
442
|
|
|
443
|
+
/**
|
|
444
|
+
* Main function
|
|
445
|
+
*/
|
|
426
446
|
async function main() {
|
|
427
447
|
console.log("🚀 Extract from Sitemap CLI");
|
|
428
448
|
|
|
@@ -438,11 +458,11 @@ async function main() {
|
|
|
438
458
|
const apiKey = await getApiKey();
|
|
439
459
|
|
|
440
460
|
// Ensure output directory exists
|
|
441
|
-
mkdirSync(config.outDir, { recursive: true });
|
|
461
|
+
fs.mkdirSync(config.outDir, { recursive: true });
|
|
442
462
|
|
|
443
463
|
// Load previous manifest
|
|
444
464
|
const previousManifest = loadManifest(config.outDir);
|
|
445
|
-
const currentFiles
|
|
465
|
+
const currentFiles = [];
|
|
446
466
|
|
|
447
467
|
let totalTokens = 0;
|
|
448
468
|
let totalPages = 0;
|
|
@@ -467,29 +487,29 @@ async function main() {
|
|
|
467
487
|
}
|
|
468
488
|
|
|
469
489
|
// Write files to disk
|
|
470
|
-
for (const [
|
|
471
|
-
let filename =
|
|
490
|
+
for (const [filePath, file] of Object.entries(result.files)) {
|
|
491
|
+
let filename = filePath;
|
|
472
492
|
|
|
473
493
|
if (!config.keepOriginalUrls) {
|
|
474
494
|
// Create domain-specific subdirectory
|
|
475
495
|
const domain = new URL(
|
|
476
496
|
origin.startsWith("http") ? origin : `https://${origin}`
|
|
477
497
|
).hostname;
|
|
478
|
-
const domainDir = join(config.outDir, domain);
|
|
479
|
-
mkdirSync(domainDir, { recursive: true });
|
|
480
|
-
filename = join(
|
|
498
|
+
const domainDir = path.join(config.outDir, domain);
|
|
499
|
+
fs.mkdirSync(domainDir, { recursive: true });
|
|
500
|
+
filename = path.join(
|
|
481
501
|
domain,
|
|
482
|
-
|
|
502
|
+
filePath.startsWith("/") ? filePath.slice(1) : filePath
|
|
483
503
|
);
|
|
484
504
|
} else {
|
|
485
|
-
filename =
|
|
505
|
+
filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
|
|
486
506
|
}
|
|
487
507
|
|
|
488
|
-
const
|
|
489
|
-
const fileDir = dirname(
|
|
508
|
+
const fullFilePath = path.join(config.outDir, filename);
|
|
509
|
+
const fileDir = path.dirname(fullFilePath);
|
|
490
510
|
|
|
491
|
-
mkdirSync(fileDir, { recursive: true });
|
|
492
|
-
writeFileSync(
|
|
511
|
+
fs.mkdirSync(fileDir, { recursive: true });
|
|
512
|
+
fs.writeFileSync(fullFilePath, file.content);
|
|
493
513
|
currentFiles.push(filename);
|
|
494
514
|
|
|
495
515
|
console.log(`📝 Wrote: ${filename} (${file.tokens} tokens)`);
|
|
@@ -514,8 +534,8 @@ async function main() {
|
|
|
514
534
|
);
|
|
515
535
|
|
|
516
536
|
for (const [filename, file] of Object.entries(customFiles)) {
|
|
517
|
-
const filePath = join(config.outDir, filename);
|
|
518
|
-
writeFileSync(filePath, file.content);
|
|
537
|
+
const filePath = path.join(config.outDir, filename);
|
|
538
|
+
fs.writeFileSync(filePath, file.content);
|
|
519
539
|
currentFiles.push(filename);
|
|
520
540
|
totalTokens += file.tokens;
|
|
521
541
|
totalPages++;
|
|
@@ -530,25 +550,34 @@ async function main() {
|
|
|
530
550
|
}
|
|
531
551
|
|
|
532
552
|
// Save new manifest
|
|
533
|
-
const newManifest
|
|
553
|
+
const newManifest = {
|
|
534
554
|
files: currentFiles,
|
|
535
555
|
timestamp: new Date().toISOString(),
|
|
536
556
|
};
|
|
537
557
|
saveManifest(config.outDir, newManifest);
|
|
538
558
|
|
|
539
|
-
console.log(
|
|
559
|
+
console.log("\n✨ Extraction completed!");
|
|
540
560
|
console.log(`📊 Total: ${totalPages} pages, ${totalTokens} tokens`);
|
|
541
561
|
if (totalErrors > 0) {
|
|
542
562
|
console.log(`⚠️ Errors: ${totalErrors}`);
|
|
543
563
|
}
|
|
544
|
-
console.log(`📁 Output directory: ${resolve(config.outDir)}`);
|
|
545
|
-
console.log(
|
|
564
|
+
console.log(`📁 Output directory: ${path.resolve(config.outDir)}`);
|
|
565
|
+
console.log("\n💡 Use --clear-credentials to remove stored API key");
|
|
546
566
|
} catch (error) {
|
|
547
567
|
console.error("💥 Fatal error:", error.message);
|
|
548
568
|
process.exit(1);
|
|
549
569
|
}
|
|
550
570
|
}
|
|
551
571
|
|
|
552
|
-
|
|
572
|
+
// Start the CLI only when this file is the process entry point, so that
// requiring it as a module does not trigger an extraction run.
if (require.main === module) {
  main();
}

// Public API for programmatic use.
module.exports = { OAuth, loadConfig, getApiKey, clearCredentials, main };
|
package/package.json
CHANGED
|
@@ -1,16 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
|
-
"bin": "cli.
|
|
4
|
-
"version": "0.0.
|
|
3
|
+
"bin": "cli.js",
|
|
4
|
+
"version": "0.0.4",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|
|
8
8
|
"mod.js",
|
|
9
|
-
"cli.
|
|
9
|
+
"cli.js"
|
|
10
10
|
],
|
|
11
11
|
"license": "MIT",
|
|
12
12
|
"devDependencies": {
|
|
13
|
-
"@cloudflare/workers-types": "4.20251011.0"
|
|
14
|
-
"@types/bun": "1.3.0"
|
|
13
|
+
"@cloudflare/workers-types": "4.20251011.0"
|
|
15
14
|
}
|
|
16
15
|
}
|