extract-from-sitemap 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +602 -0
- package/package.json +4 -5
- package/cli.ts +0 -434
package/cli.js
ADDED
|
@@ -0,0 +1,602 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const fs = require("fs");
|
|
4
|
+
const path = require("path");
|
|
5
|
+
const { spawn } = require("child_process");
|
|
6
|
+
const crypto = require("crypto");
|
|
7
|
+
const http = require("http");
|
|
8
|
+
const { URL, URLSearchParams } = require("url");
|
|
9
|
+
const os = require("os");
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* @typedef {Object} Config
|
|
13
|
+
* @property {string} outDir - Output directory for extracted files
|
|
14
|
+
* @property {string[]} origins - Array of origin URLs to process
|
|
15
|
+
* @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
|
|
16
|
+
* @property {boolean} keepOriginalUrls - Whether to keep original URL structure
|
|
17
|
+
* @property {boolean} forceExtract - Whether to force extraction even if files exist
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* @typedef {Object} Manifest
|
|
22
|
+
* @property {string[]} files - List of generated files
|
|
23
|
+
* @property {string} timestamp - Timestamp of last generation
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
|
|
27
|
+
const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* OAuth handler for Parallel.ai API key authentication
|
|
31
|
+
*/
|
|
32
|
+
class OAuth {
|
|
33
|
+
constructor() {
|
|
34
|
+
this.clientId = "extract-from-sitemap-cli";
|
|
35
|
+
this.redirectUri = "http://localhost:3737/callback";
|
|
36
|
+
this.scope = "key:read";
|
|
37
|
+
this.server = null;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Get API key through OAuth flow
|
|
42
|
+
* @returns {Promise<string>} The API key
|
|
43
|
+
*/
|
|
44
|
+
async getApiKey() {
|
|
45
|
+
console.log("š Starting OAuth flow...");
|
|
46
|
+
|
|
47
|
+
// Generate PKCE parameters
|
|
48
|
+
const { codeVerifier, codeChallenge } = await this.generatePKCE();
|
|
49
|
+
|
|
50
|
+
// Build authorization URL
|
|
51
|
+
const authUrl = new URL("https://platform.parallel.ai/getKeys/authorize");
|
|
52
|
+
authUrl.searchParams.set("client_id", this.clientId);
|
|
53
|
+
authUrl.searchParams.set("redirect_uri", this.redirectUri);
|
|
54
|
+
authUrl.searchParams.set("response_type", "code");
|
|
55
|
+
authUrl.searchParams.set("scope", this.scope);
|
|
56
|
+
authUrl.searchParams.set("code_challenge", codeChallenge);
|
|
57
|
+
authUrl.searchParams.set("code_challenge_method", "S256");
|
|
58
|
+
authUrl.searchParams.set("state", Math.random().toString(36));
|
|
59
|
+
|
|
60
|
+
console.log("\nš Opening browser for authorization...");
|
|
61
|
+
|
|
62
|
+
// Open browser automatically
|
|
63
|
+
await this.openBrowser(authUrl.toString());
|
|
64
|
+
|
|
65
|
+
// Start simple HTTP server to catch the callback
|
|
66
|
+
const code = await this.startCallbackServer();
|
|
67
|
+
|
|
68
|
+
// Exchange code for token
|
|
69
|
+
console.log("š Exchanging authorization code for API key...");
|
|
70
|
+
|
|
71
|
+
const response = await fetch("https://platform.parallel.ai/getKeys/token", {
|
|
72
|
+
method: "POST",
|
|
73
|
+
headers: { "Content-Type": "application/x-www-form-urlencoded" },
|
|
74
|
+
body: new URLSearchParams({
|
|
75
|
+
grant_type: "authorization_code",
|
|
76
|
+
code: code,
|
|
77
|
+
client_id: this.clientId,
|
|
78
|
+
redirect_uri: this.redirectUri,
|
|
79
|
+
code_verifier: codeVerifier,
|
|
80
|
+
}),
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
if (!response.ok) {
|
|
84
|
+
throw new Error(
|
|
85
|
+
`Token exchange failed: ${response.status} ${response.statusText}`
|
|
86
|
+
);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
const { access_token } = await response.json();
|
|
90
|
+
console.log("✅ Successfully obtained API key!");
|
|
91
|
+
|
|
92
|
+
return access_token;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Open browser to authorization URL
|
|
97
|
+
* @param {string} url - The authorization URL
|
|
98
|
+
*/
|
|
99
|
+
async openBrowser(url) {
|
|
100
|
+
try {
|
|
101
|
+
const platform = process.platform;
|
|
102
|
+
let command, args;
|
|
103
|
+
|
|
104
|
+
if (platform === "darwin") {
|
|
105
|
+
command = "open";
|
|
106
|
+
args = [url];
|
|
107
|
+
} else if (platform === "win32") {
|
|
108
|
+
command = "start";
|
|
109
|
+
args = ["", url];
|
|
110
|
+
} else {
|
|
111
|
+
// Linux/Unix
|
|
112
|
+
command = "xdg-open";
|
|
113
|
+
args = [url];
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
spawn(command, args, { detached: true, stdio: "ignore" });
|
|
117
|
+
} catch (error) {
|
|
118
|
+
console.log("\nš Please visit this URL to authorize the application:");
|
|
119
|
+
console.log(`${url}\n`);
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Generate PKCE code verifier and challenge
|
|
125
|
+
* @returns {Promise<{codeVerifier: string, codeChallenge: string}>}
|
|
126
|
+
*/
|
|
127
|
+
async generatePKCE() {
|
|
128
|
+
const codeVerifier = crypto.randomBytes(32).toString("base64url");
|
|
129
|
+
const hash = crypto
|
|
130
|
+
.createHash("sha256")
|
|
131
|
+
.update(codeVerifier)
|
|
132
|
+
.digest("base64url");
|
|
133
|
+
|
|
134
|
+
return {
|
|
135
|
+
codeVerifier,
|
|
136
|
+
codeChallenge: hash,
|
|
137
|
+
};
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
/**
|
|
141
|
+
* Start HTTP server to catch OAuth callback
|
|
142
|
+
* @returns {Promise<string>} The authorization code
|
|
143
|
+
*/
|
|
144
|
+
async startCallbackServer() {
|
|
145
|
+
return new Promise((resolve, reject) => {
|
|
146
|
+
this.server = http.createServer((req, res) => {
|
|
147
|
+
const url = new URL(req.url, `http://${req.headers.host}`);
|
|
148
|
+
|
|
149
|
+
if (url.pathname === "/callback") {
|
|
150
|
+
const code = url.searchParams.get("code");
|
|
151
|
+
const error = url.searchParams.get("error");
|
|
152
|
+
|
|
153
|
+
if (error) {
|
|
154
|
+
reject(new Error(`OAuth error: ${error}`));
|
|
155
|
+
res.writeHead(400);
|
|
156
|
+
res.end("Error occurred. You can close this window.");
|
|
157
|
+
return;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
if (code) {
|
|
161
|
+
resolve(code);
|
|
162
|
+
res.writeHead(200, { "Content-Type": "text/html" });
|
|
163
|
+
res.end(
|
|
164
|
+
"✅ Authorization successful! You can close this window and return to the terminal."
|
|
165
|
+
);
|
|
166
|
+
return;
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
res.writeHead(404);
|
|
171
|
+
res.end("Invalid request");
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
this.server.listen(3737);
|
|
175
|
+
|
|
176
|
+
// Timeout after 5 minutes
|
|
177
|
+
setTimeout(() => {
|
|
178
|
+
this.stopServer();
|
|
179
|
+
reject(new Error("OAuth flow timed out"));
|
|
180
|
+
}, 300000);
|
|
181
|
+
}).finally(() => {
|
|
182
|
+
this.stopServer();
|
|
183
|
+
});
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Stop the callback server
|
|
188
|
+
*/
|
|
189
|
+
stopServer() {
|
|
190
|
+
if (this.server) {
|
|
191
|
+
this.server.close();
|
|
192
|
+
this.server = null;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/**
|
|
198
|
+
* Load configuration from llmtext.json
|
|
199
|
+
* @returns {Promise<Config>} The configuration object
|
|
200
|
+
*/
|
|
201
|
+
async function loadConfig() {
|
|
202
|
+
const configPath = path.resolve("llmtext.json");
|
|
203
|
+
|
|
204
|
+
if (!fs.existsSync(configPath)) {
|
|
205
|
+
console.error(
|
|
206
|
+
"❌ llmtext.json not found. Please create a configuration file."
|
|
207
|
+
);
|
|
208
|
+
console.log("\nExample llmtext.json:");
|
|
209
|
+
console.log(
|
|
210
|
+
JSON.stringify(
|
|
211
|
+
{
|
|
212
|
+
outDir: "./docs",
|
|
213
|
+
origins: ["https://docs.example.com"],
|
|
214
|
+
customUrls: [],
|
|
215
|
+
keepOriginalUrls: false,
|
|
216
|
+
forceExtract: false,
|
|
217
|
+
},
|
|
218
|
+
null,
|
|
219
|
+
2
|
|
220
|
+
)
|
|
221
|
+
);
|
|
222
|
+
process.exit(1);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
try {
|
|
226
|
+
const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
|
|
227
|
+
|
|
228
|
+
// Validate required fields
|
|
229
|
+
if (!config.outDir) throw new Error("outDir is required");
|
|
230
|
+
if (!Array.isArray(config.origins))
|
|
231
|
+
throw new Error("origins must be an array");
|
|
232
|
+
|
|
233
|
+
// Set defaults
|
|
234
|
+
config.customUrls = config.customUrls || [];
|
|
235
|
+
config.keepOriginalUrls = config.keepOriginalUrls ?? false;
|
|
236
|
+
config.forceExtract = config.forceExtract ?? false;
|
|
237
|
+
|
|
238
|
+
return config;
|
|
239
|
+
} catch (error) {
|
|
240
|
+
console.error("❌ Error reading llmtext.json:", error.message);
|
|
241
|
+
process.exit(1);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
/**
|
|
246
|
+
* Store API key in ~/.llmtext/api-key
|
|
247
|
+
* @param {string} apiKey - The API key to store
|
|
248
|
+
*/
|
|
249
|
+
function storeApiKey(apiKey) {
|
|
250
|
+
try {
|
|
251
|
+
fs.mkdirSync(CREDENTIALS_DIR, { recursive: true });
|
|
252
|
+
fs.writeFileSync(API_KEY_FILE, apiKey, { mode: 0o600 }); // Only owner can read
|
|
253
|
+
console.log("💾 API key stored securely in ~/.llmtext/api-key");
|
|
254
|
+
} catch (error) {
|
|
255
|
+
console.warn("⚠️ Could not store API key:", error.message);
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
/**
|
|
260
|
+
* Load API key from ~/.llmtext/api-key
|
|
261
|
+
* @returns {string|null} The stored API key or null if not found
|
|
262
|
+
*/
|
|
263
|
+
function loadStoredApiKey() {
|
|
264
|
+
try {
|
|
265
|
+
if (fs.existsSync(API_KEY_FILE)) {
|
|
266
|
+
const apiKey = fs.readFileSync(API_KEY_FILE, "utf8").trim();
|
|
267
|
+
if (apiKey) {
|
|
268
|
+
console.log("š Using stored API key from ~/.llmtext/api-key");
|
|
269
|
+
return apiKey;
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
} catch (error) {
|
|
273
|
+
console.warn("⚠️ Could not read stored API key:", error.message);
|
|
274
|
+
}
|
|
275
|
+
return null;
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
/**
|
|
279
|
+
* Get API key from various sources or start OAuth flow
|
|
280
|
+
* @returns {Promise<string>} The API key
|
|
281
|
+
*/
|
|
282
|
+
async function getApiKey() {
|
|
283
|
+
// Check stored API key first
|
|
284
|
+
const storedKey = loadStoredApiKey();
|
|
285
|
+
if (storedKey) {
|
|
286
|
+
return storedKey;
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
// Check environment variables
|
|
290
|
+
let apiKey = process.env.PARALLEL_API_KEY;
|
|
291
|
+
|
|
292
|
+
if (!apiKey && fs.existsSync(".env")) {
|
|
293
|
+
// Try to load from .env file
|
|
294
|
+
const envContent = fs.readFileSync(".env", "utf8");
|
|
295
|
+
const match = envContent.match(/^PARALLEL_API_KEY=(.+)$/m);
|
|
296
|
+
if (match) {
|
|
297
|
+
apiKey = match[1].trim();
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
if (apiKey) {
|
|
302
|
+
console.log("š Using API key from environment");
|
|
303
|
+
storeApiKey(apiKey);
|
|
304
|
+
return apiKey;
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
// No API key found, start OAuth flow
|
|
308
|
+
console.log("š No API key found. Starting OAuth flow...");
|
|
309
|
+
const oauth = new OAuth();
|
|
310
|
+
const newApiKey = await oauth.getApiKey();
|
|
311
|
+
|
|
312
|
+
storeApiKey(newApiKey);
|
|
313
|
+
return newApiKey;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/**
|
|
317
|
+
* Load manifest file
|
|
318
|
+
* @param {string} outDir - Output directory
|
|
319
|
+
* @returns {Manifest} The manifest object
|
|
320
|
+
*/
|
|
321
|
+
function loadManifest(outDir) {
|
|
322
|
+
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
323
|
+
|
|
324
|
+
if (!fs.existsSync(manifestPath)) {
|
|
325
|
+
return { files: [], timestamp: new Date().toISOString() };
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
try {
|
|
329
|
+
return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
|
|
330
|
+
} catch {
|
|
331
|
+
return { files: [], timestamp: new Date().toISOString() };
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
/**
|
|
336
|
+
* Save manifest file
|
|
337
|
+
* @param {string} outDir - Output directory
|
|
338
|
+
* @param {Manifest} manifest - The manifest to save
|
|
339
|
+
*/
|
|
340
|
+
function saveManifest(outDir, manifest) {
|
|
341
|
+
const manifestPath = path.join(outDir, "llmtext-manifest.json");
|
|
342
|
+
fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
/**
|
|
346
|
+
* Clean up old files that are no longer generated
|
|
347
|
+
* @param {string} outDir - Output directory
|
|
348
|
+
* @param {string[]} currentFiles - Currently generated files
|
|
349
|
+
* @param {string[]} previousFiles - Previously generated files
|
|
350
|
+
*/
|
|
351
|
+
function cleanupOldFiles(outDir, currentFiles, previousFiles) {
|
|
352
|
+
const filesToRemove = previousFiles.filter(
|
|
353
|
+
(file) => !currentFiles.includes(file)
|
|
354
|
+
);
|
|
355
|
+
|
|
356
|
+
for (const file of filesToRemove) {
|
|
357
|
+
const filePath = path.join(outDir, file);
|
|
358
|
+
try {
|
|
359
|
+
if (fs.existsSync(filePath)) {
|
|
360
|
+
fs.rmSync(filePath);
|
|
361
|
+
console.log(`🗑️ Removed old file: ${file}`);
|
|
362
|
+
}
|
|
363
|
+
} catch (error) {
|
|
364
|
+
console.warn(`⚠️ Could not remove ${file}:`, error.message);
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
/**
|
|
370
|
+
* Process custom URLs through extraction API
|
|
371
|
+
* @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
|
|
372
|
+
* @param {string} apiKey - API key for authentication
|
|
373
|
+
* @param {boolean} forceExtract - Whether to force extraction
|
|
374
|
+
* @returns {Promise<Record<string, any>>} Extracted files
|
|
375
|
+
*/
|
|
376
|
+
async function processCustomUrls(customUrls, apiKey, forceExtract) {
|
|
377
|
+
const files = {};
|
|
378
|
+
|
|
379
|
+
for (const customUrl of customUrls) {
|
|
380
|
+
console.log(`š Processing custom URL: ${customUrl.url}`);
|
|
381
|
+
|
|
382
|
+
try {
|
|
383
|
+
const response = await fetch("https://api.parallel.ai/v1beta/extract", {
|
|
384
|
+
method: "POST",
|
|
385
|
+
headers: {
|
|
386
|
+
"Content-Type": "application/json",
|
|
387
|
+
"parallel-beta": "search-extract-2025-10-10",
|
|
388
|
+
"x-api-key": apiKey,
|
|
389
|
+
},
|
|
390
|
+
body: JSON.stringify({
|
|
391
|
+
urls: [customUrl.url],
|
|
392
|
+
full_content: true,
|
|
393
|
+
}),
|
|
394
|
+
});
|
|
395
|
+
|
|
396
|
+
if (response.ok) {
|
|
397
|
+
const result = await response.json();
|
|
398
|
+
if (result.results && result.results.length > 0) {
|
|
399
|
+
const extracted = result.results[0];
|
|
400
|
+
const filename =
|
|
401
|
+
customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
|
|
402
|
+
|
|
403
|
+
files[filename] = {
|
|
404
|
+
content: extracted.full_content || "",
|
|
405
|
+
title: customUrl.title,
|
|
406
|
+
description: customUrl.description,
|
|
407
|
+
extracted: true,
|
|
408
|
+
publishedDate: extracted.published_date || "",
|
|
409
|
+
status: 200,
|
|
410
|
+
tokens: Math.round((extracted.full_content || "").length / 5),
|
|
411
|
+
};
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
} catch (error) {
|
|
415
|
+
console.error(
|
|
416
|
+
`❌ Error processing custom URL ${customUrl.url}:`,
|
|
417
|
+
error.message
|
|
418
|
+
);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
return files;
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
/**
|
|
426
|
+
* Clear stored API key credentials
|
|
427
|
+
*/
|
|
428
|
+
async function clearCredentials() {
|
|
429
|
+
try {
|
|
430
|
+
if (fs.existsSync(API_KEY_FILE)) {
|
|
431
|
+
fs.unlinkSync(API_KEY_FILE);
|
|
432
|
+
console.log("✅ Cleared stored API key from ~/.llmtext/api-key");
|
|
433
|
+
} else {
|
|
434
|
+
console.log("ℹ️ No stored API key found to clear");
|
|
435
|
+
}
|
|
436
|
+
} catch (error) {
|
|
437
|
+
console.error("❌ Error clearing credentials:", error.message);
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
/**
|
|
442
|
+
* Extract content from sitemap (placeholder - you'll need to implement this)
|
|
443
|
+
* @param {string} origin - The origin URL
|
|
444
|
+
* @param {boolean} forceExtract - Whether to force extraction
|
|
445
|
+
* @param {string} apiKey - API key for authentication
|
|
446
|
+
* @returns {Promise<{totalPages: number, totalTokens: number, errors: number, files: Record<string, any>}>}
|
|
447
|
+
*/
|
|
448
|
+
async function extractFromSitemap(origin, forceExtract, apiKey) {
|
|
449
|
+
// This is a placeholder - you'll need to implement the actual extraction logic
|
|
450
|
+
// or import it from your mod.js file
|
|
451
|
+
console.log(`Extracting from ${origin} (force: ${forceExtract})`);
|
|
452
|
+
|
|
453
|
+
// For now, return empty result
|
|
454
|
+
return {
|
|
455
|
+
totalPages: 0,
|
|
456
|
+
totalTokens: 0,
|
|
457
|
+
errors: 0,
|
|
458
|
+
files: {},
|
|
459
|
+
};
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
/**
|
|
463
|
+
* Main function
|
|
464
|
+
*/
|
|
465
|
+
async function main() {
|
|
466
|
+
console.log("š Extract from Sitemap CLI");
|
|
467
|
+
|
|
468
|
+
// Check for special commands
|
|
469
|
+
const args = process.argv.slice(2);
|
|
470
|
+
if (args.includes("--clear-credentials")) {
|
|
471
|
+
await clearCredentials();
|
|
472
|
+
return;
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
try {
|
|
476
|
+
const config = await loadConfig();
|
|
477
|
+
const apiKey = await getApiKey();
|
|
478
|
+
|
|
479
|
+
// Ensure output directory exists
|
|
480
|
+
fs.mkdirSync(config.outDir, { recursive: true });
|
|
481
|
+
|
|
482
|
+
// Load previous manifest
|
|
483
|
+
const previousManifest = loadManifest(config.outDir);
|
|
484
|
+
const currentFiles = [];
|
|
485
|
+
|
|
486
|
+
let totalTokens = 0;
|
|
487
|
+
let totalPages = 0;
|
|
488
|
+
let totalErrors = 0;
|
|
489
|
+
|
|
490
|
+
// Process each origin
|
|
491
|
+
for (const origin of config.origins) {
|
|
492
|
+
console.log(`\nš Processing origin: ${origin}`);
|
|
493
|
+
|
|
494
|
+
try {
|
|
495
|
+
const result = await extractFromSitemap(
|
|
496
|
+
origin,
|
|
497
|
+
config.forceExtract,
|
|
498
|
+
apiKey
|
|
499
|
+
);
|
|
500
|
+
|
|
501
|
+
console.log(
|
|
502
|
+
`✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
|
|
503
|
+
);
|
|
504
|
+
if (result.errors > 0) {
|
|
505
|
+
console.log(`⚠️ ${result.errors} errors occurred`);
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
// Write files to disk
|
|
509
|
+
for (const [filePath, file] of Object.entries(result.files)) {
|
|
510
|
+
let filename = filePath;
|
|
511
|
+
|
|
512
|
+
if (!config.keepOriginalUrls) {
|
|
513
|
+
// Create domain-specific subdirectory
|
|
514
|
+
const domain = new URL(
|
|
515
|
+
origin.startsWith("http") ? origin : `https://${origin}`
|
|
516
|
+
).hostname;
|
|
517
|
+
const domainDir = path.join(config.outDir, domain);
|
|
518
|
+
fs.mkdirSync(domainDir, { recursive: true });
|
|
519
|
+
filename = path.join(
|
|
520
|
+
domain,
|
|
521
|
+
filePath.startsWith("/") ? filePath.slice(1) : filePath
|
|
522
|
+
);
|
|
523
|
+
} else {
|
|
524
|
+
filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
const fullFilePath = path.join(config.outDir, filename);
|
|
528
|
+
const fileDir = path.dirname(fullFilePath);
|
|
529
|
+
|
|
530
|
+
fs.mkdirSync(fileDir, { recursive: true });
|
|
531
|
+
fs.writeFileSync(fullFilePath, file.content);
|
|
532
|
+
currentFiles.push(filename);
|
|
533
|
+
|
|
534
|
+
console.log(`š Wrote: ${filename} (${file.tokens} tokens)`);
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
totalTokens += result.totalTokens;
|
|
538
|
+
totalPages += result.totalPages;
|
|
539
|
+
totalErrors += result.errors;
|
|
540
|
+
} catch (error) {
|
|
541
|
+
console.error(`❌ Error processing ${origin}:`, error.message);
|
|
542
|
+
totalErrors++;
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
// Process custom URLs
|
|
547
|
+
if (config.customUrls.length > 0) {
|
|
548
|
+
console.log(`\nš Processing ${config.customUrls.length} custom URLs...`);
|
|
549
|
+
const customFiles = await processCustomUrls(
|
|
550
|
+
config.customUrls,
|
|
551
|
+
apiKey,
|
|
552
|
+
config.forceExtract
|
|
553
|
+
);
|
|
554
|
+
|
|
555
|
+
for (const [filename, file] of Object.entries(customFiles)) {
|
|
556
|
+
const filePath = path.join(config.outDir, filename);
|
|
557
|
+
fs.writeFileSync(filePath, file.content);
|
|
558
|
+
currentFiles.push(filename);
|
|
559
|
+
totalTokens += file.tokens;
|
|
560
|
+
totalPages++;
|
|
561
|
+
|
|
562
|
+
console.log(`š Wrote: ${filename} (${file.tokens} tokens)`);
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
// Clean up old files
|
|
567
|
+
if (previousManifest.files.length > 0) {
|
|
568
|
+
cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
// Save new manifest
|
|
572
|
+
const newManifest = {
|
|
573
|
+
files: currentFiles,
|
|
574
|
+
timestamp: new Date().toISOString(),
|
|
575
|
+
};
|
|
576
|
+
saveManifest(config.outDir, newManifest);
|
|
577
|
+
|
|
578
|
+
console.log("\n✨ Extraction completed!");
|
|
579
|
+
console.log(`š Total: ${totalPages} pages, ${totalTokens} tokens`);
|
|
580
|
+
if (totalErrors > 0) {
|
|
581
|
+
console.log(`⚠️ Errors: ${totalErrors}`);
|
|
582
|
+
}
|
|
583
|
+
console.log(`š Output directory: ${path.resolve(config.outDir)}`);
|
|
584
|
+
console.log("\n💡 Use --clear-credentials to remove stored API key");
|
|
585
|
+
} catch (error) {
|
|
586
|
+
console.error("💥 Fatal error:", error.message);
|
|
587
|
+
process.exit(1);
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
// Run main function if this file is executed directly
|
|
592
|
+
if (require.main === module) {
|
|
593
|
+
main();
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
module.exports = {
|
|
597
|
+
OAuth,
|
|
598
|
+
loadConfig,
|
|
599
|
+
getApiKey,
|
|
600
|
+
clearCredentials,
|
|
601
|
+
main,
|
|
602
|
+
};
|
package/package.json
CHANGED
|
@@ -1,16 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
|
-
"bin": "cli.ts",
|
|
4
|
-
"version": "0.0.1",
|
|
3
|
+
"bin": "cli.js",
|
|
4
|
+
"version": "0.0.3",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|
|
8
8
|
"mod.js",
|
|
9
|
-
"cli.ts"
|
|
9
|
+
"cli.js"
|
|
10
10
|
],
|
|
11
11
|
"license": "MIT",
|
|
12
12
|
"devDependencies": {
|
|
13
|
-
"@cloudflare/workers-types": "4.20251011.0",
|
|
14
|
-
"@types/bun": "1.3.0"
|
|
13
|
+
"@cloudflare/workers-types": "4.20251011.0"
|
|
15
14
|
}
|
|
16
15
|
}
|
package/cli.ts
DELETED
|
@@ -1,434 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
/// <reference types="@types/bun" />
|
|
3
|
-
/// <reference lib="esnext" />
|
|
4
|
-
|
|
5
|
-
import {
|
|
6
|
-
existsSync,
|
|
7
|
-
readFileSync,
|
|
8
|
-
writeFileSync,
|
|
9
|
-
mkdirSync,
|
|
10
|
-
rmSync,
|
|
11
|
-
readdirSync,
|
|
12
|
-
} from "fs";
|
|
13
|
-
import { join, dirname, resolve } from "path";
|
|
14
|
-
import { extractFromSitemap } from "./mod.js";
|
|
15
|
-
|
|
16
|
-
interface Config {
|
|
17
|
-
outDir: string;
|
|
18
|
-
origins: string[];
|
|
19
|
-
customUrls: Array<{
|
|
20
|
-
title: string;
|
|
21
|
-
description: string;
|
|
22
|
-
url: string;
|
|
23
|
-
}>;
|
|
24
|
-
keepOriginalUrls: boolean;
|
|
25
|
-
forceExtract: boolean;
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
interface Manifest {
|
|
29
|
-
files: string[];
|
|
30
|
-
timestamp: string;
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
class OAuth {
|
|
34
|
-
private clientId: string;
|
|
35
|
-
private redirectUri: string;
|
|
36
|
-
private scope: string;
|
|
37
|
-
|
|
38
|
-
constructor() {
|
|
39
|
-
this.clientId = "extract-from-sitemap-cli";
|
|
40
|
-
this.redirectUri = "http://localhost:3737/callback";
|
|
41
|
-
this.scope = "key:read";
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
async getApiKey(): Promise<string> {
|
|
45
|
-
console.log("š Starting OAuth flow...");
|
|
46
|
-
|
|
47
|
-
// Generate PKCE parameters
|
|
48
|
-
const { codeVerifier, codeChallenge } = await this.generatePKCE();
|
|
49
|
-
|
|
50
|
-
// Build authorization URL
|
|
51
|
-
const authUrl = new URL("https://platform.parallel.ai/getKeys/authorize");
|
|
52
|
-
authUrl.searchParams.set("client_id", this.clientId);
|
|
53
|
-
authUrl.searchParams.set("redirect_uri", this.redirectUri);
|
|
54
|
-
authUrl.searchParams.set("response_type", "code");
|
|
55
|
-
authUrl.searchParams.set("scope", this.scope);
|
|
56
|
-
authUrl.searchParams.set("code_challenge", codeChallenge);
|
|
57
|
-
authUrl.searchParams.set("code_challenge_method", "S256");
|
|
58
|
-
authUrl.searchParams.set("state", Math.random().toString(36));
|
|
59
|
-
|
|
60
|
-
console.log(`\nš Please visit this URL to authorize the application:`);
|
|
61
|
-
console.log(`${authUrl.toString()}\n`);
|
|
62
|
-
|
|
63
|
-
// Start simple HTTP server to catch the callback
|
|
64
|
-
const code = await this.startCallbackServer();
|
|
65
|
-
|
|
66
|
-
// Exchange code for token
|
|
67
|
-
console.log("š Exchanging authorization code for API key...");
|
|
68
|
-
|
|
69
|
-
const response = await fetch("https://platform.parallel.ai/getKeys/token", {
|
|
70
|
-
method: "POST",
|
|
71
|
-
headers: { "Content-Type": "application/x-www-form-urlencoded" },
|
|
72
|
-
body: new URLSearchParams({
|
|
73
|
-
grant_type: "authorization_code",
|
|
74
|
-
code: code,
|
|
75
|
-
client_id: this.clientId,
|
|
76
|
-
redirect_uri: this.redirectUri,
|
|
77
|
-
code_verifier: codeVerifier,
|
|
78
|
-
}),
|
|
79
|
-
});
|
|
80
|
-
|
|
81
|
-
if (!response.ok) {
|
|
82
|
-
throw new Error(
|
|
83
|
-
`Token exchange failed: ${response.status} ${response.statusText}`
|
|
84
|
-
);
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
const { access_token } = await response.json();
|
|
88
|
-
console.log("✅ Successfully obtained API key!");
|
|
89
|
-
|
|
90
|
-
return access_token;
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
private async generatePKCE(): Promise<{
|
|
94
|
-
codeVerifier: string;
|
|
95
|
-
codeChallenge: string;
|
|
96
|
-
}> {
|
|
97
|
-
const codeVerifier = btoa(
|
|
98
|
-
String.fromCharCode(...crypto.getRandomValues(new Uint8Array(32)))
|
|
99
|
-
).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
|
|
100
|
-
|
|
101
|
-
const hash = await crypto.subtle.digest(
|
|
102
|
-
"SHA-256",
|
|
103
|
-
new TextEncoder().encode(codeVerifier)
|
|
104
|
-
);
|
|
105
|
-
const codeChallenge = btoa(
|
|
106
|
-
String.fromCharCode(...new Uint8Array(hash))
|
|
107
|
-
).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
|
|
108
|
-
|
|
109
|
-
return { codeVerifier, codeChallenge };
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
private async startCallbackServer(): Promise<string> {
|
|
113
|
-
return new Promise((resolve, reject) => {
|
|
114
|
-
const server = Bun.serve({
|
|
115
|
-
port: 3737,
|
|
116
|
-
fetch(req) {
|
|
117
|
-
const url = new URL(req.url);
|
|
118
|
-
|
|
119
|
-
if (url.pathname === "/callback") {
|
|
120
|
-
const code = url.searchParams.get("code");
|
|
121
|
-
const error = url.searchParams.get("error");
|
|
122
|
-
|
|
123
|
-
if (error) {
|
|
124
|
-
reject(new Error(`OAuth error: ${error}`));
|
|
125
|
-
return new Response(
|
|
126
|
-
"Error occurred. You can close this window.",
|
|
127
|
-
{ status: 400 }
|
|
128
|
-
);
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
if (code) {
|
|
132
|
-
resolve(code);
|
|
133
|
-
server.stop();
|
|
134
|
-
return new Response(
|
|
135
|
-
"✅ Authorization successful! You can close this window and return to the terminal."
|
|
136
|
-
);
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
return new Response("Invalid request", { status: 404 });
|
|
141
|
-
},
|
|
142
|
-
});
|
|
143
|
-
|
|
144
|
-
// Timeout after 5 minutes
|
|
145
|
-
setTimeout(() => {
|
|
146
|
-
server.stop();
|
|
147
|
-
reject(new Error("OAuth flow timed out"));
|
|
148
|
-
}, 300000);
|
|
149
|
-
});
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
async function loadConfig(): Promise<Config> {
|
|
154
|
-
const configPath = resolve("llmtext.json");
|
|
155
|
-
|
|
156
|
-
if (!existsSync(configPath)) {
|
|
157
|
-
console.error(
|
|
158
|
-
"❌ llmtext.json not found. Please create a configuration file."
|
|
159
|
-
);
|
|
160
|
-
console.log("\nExample llmtext.json:");
|
|
161
|
-
console.log(
|
|
162
|
-
JSON.stringify(
|
|
163
|
-
{
|
|
164
|
-
outDir: "./docs",
|
|
165
|
-
origins: ["https://docs.example.com"],
|
|
166
|
-
customUrls: [],
|
|
167
|
-
keepOriginalUrls: false,
|
|
168
|
-
forceExtract: false,
|
|
169
|
-
},
|
|
170
|
-
null,
|
|
171
|
-
2
|
|
172
|
-
)
|
|
173
|
-
);
|
|
174
|
-
process.exit(1);
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
try {
|
|
178
|
-
const config = JSON.parse(readFileSync(configPath, "utf8")) as Config;
|
|
179
|
-
|
|
180
|
-
// Validate required fields
|
|
181
|
-
if (!config.outDir) throw new Error("outDir is required");
|
|
182
|
-
if (!Array.isArray(config.origins))
|
|
183
|
-
throw new Error("origins must be an array");
|
|
184
|
-
|
|
185
|
-
// Set defaults
|
|
186
|
-
config.customUrls = config.customUrls || [];
|
|
187
|
-
config.keepOriginalUrls = config.keepOriginalUrls ?? false;
|
|
188
|
-
config.forceExtract = config.forceExtract ?? false;
|
|
189
|
-
|
|
190
|
-
return config;
|
|
191
|
-
} catch (error) {
|
|
192
|
-
console.error("❌ Error reading llmtext.json:", error.message);
|
|
193
|
-
process.exit(1);
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
async function getApiKey(): Promise<string> {
|
|
198
|
-
// Check environment variables first
|
|
199
|
-
let apiKey = process.env.PARALLEL_API_KEY;
|
|
200
|
-
|
|
201
|
-
if (!apiKey && existsSync(".env")) {
|
|
202
|
-
// Try to load from .env file
|
|
203
|
-
const envContent = readFileSync(".env", "utf8");
|
|
204
|
-
const match = envContent.match(/^PARALLEL_API_KEY=(.+)$/m);
|
|
205
|
-
if (match) {
|
|
206
|
-
apiKey = match[1].trim();
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
if (!apiKey) {
|
|
211
|
-
console.log("š No API key found in environment or .env file.");
|
|
212
|
-
const oauth = new OAuth();
|
|
213
|
-
apiKey = await oauth.getApiKey();
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
return apiKey;
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
function loadManifest(outDir: string): Manifest {
|
|
220
|
-
const manifestPath = join(outDir, "llmtext-manifest.json");
|
|
221
|
-
|
|
222
|
-
if (!existsSync(manifestPath)) {
|
|
223
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
try {
|
|
227
|
-
return JSON.parse(readFileSync(manifestPath, "utf8"));
|
|
228
|
-
} catch {
|
|
229
|
-
return { files: [], timestamp: new Date().toISOString() };
|
|
230
|
-
}
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
function saveManifest(outDir: string, manifest: Manifest): void {
|
|
234
|
-
const manifestPath = join(outDir, "llmtext-manifest.json");
|
|
235
|
-
writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
function cleanupOldFiles(
|
|
239
|
-
outDir: string,
|
|
240
|
-
currentFiles: string[],
|
|
241
|
-
previousFiles: string[]
|
|
242
|
-
): void {
|
|
243
|
-
const filesToRemove = previousFiles.filter(
|
|
244
|
-
(file) => !currentFiles.includes(file)
|
|
245
|
-
);
|
|
246
|
-
|
|
247
|
-
for (const file of filesToRemove) {
|
|
248
|
-
const filePath = join(outDir, file);
|
|
249
|
-
try {
|
|
250
|
-
if (existsSync(filePath)) {
|
|
251
|
-
rmSync(filePath);
|
|
252
|
-
console.log(`🗑️ Removed old file: ${file}`);
|
|
253
|
-
}
|
|
254
|
-
} catch (error) {
|
|
255
|
-
console.warn(`⚠️ Could not remove ${file}:`, error.message);
|
|
256
|
-
}
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
/**
 * Extract each custom URL individually through the Parallel extract API.
 *
 * URLs are processed one after another. A URL that fails (network error,
 * non-OK HTTP status, or an empty result list) is logged and skipped; it never
 * aborts the remaining URLs.
 *
 * @param customUrls - Pages to extract, with the title/description to attach
 * @param apiKey - Key sent in the `x-api-key` header
 * @param forceExtract - Accepted for signature compatibility; not used here
 * @returns Map from generated filename (sanitized, lower-cased title + `.md`)
 *   to the extracted file record
 */
async function processCustomUrls(
  customUrls: Array<{ title: string; description: string; url: string }>,
  apiKey: string,
  forceExtract: boolean
): Promise<Record<string, any>> {
  const files: Record<string, any> = {};

  for (const customUrl of customUrls) {
    console.log(`š Processing custom URL: ${customUrl.url}`);

    try {
      // For custom URLs, we need to extract them individually
      const response = await fetch("https://api.parallel.ai/v1beta/extract", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "parallel-beta": "search-extract-2025-10-10",
          "x-api-key": apiKey,
        },
        body: JSON.stringify({
          urls: [customUrl.url],
          full_content: true,
        }),
      });

      if (!response.ok) {
        // Previously dropped silently; surface it so missing files are explainable.
        console.warn(
          `⚠️ Extract request for ${customUrl.url} failed with HTTP ${response.status}`
        );
        continue;
      }

      const result = await response.json();
      const extracted = result?.results?.[0];
      if (!extracted) {
        console.warn(`⚠️ No content returned for ${customUrl.url}`);
        continue;
      }

      const content = extracted.full_content || "";
      const filename =
        customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";

      files[filename] = {
        content,
        title: customUrl.title,
        description: customUrl.description,
        extracted: true,
        publishedDate: extracted.published_date || "",
        status: 200,
        // Rough token estimate: one token per five characters.
        tokens: Math.round(content.length / 5),
      };
    } catch (error) {
      console.error(
        `ā Error processing custom URL ${customUrl.url}:`,
        error instanceof Error ? error.message : String(error)
      );
    }
  }

  return files;
}
|
|
313
|
-
|
|
314
|
-
/**
 * CLI entry point.
 *
 * Loads the configuration and API key, extracts every configured origin and
 * custom URL into `config.outDir`, removes files left over from the previous
 * run, and writes a fresh manifest.
 *
 * A failure while processing one origin is logged and counted, and the
 * remaining origins are still processed. When any origin failed, cleanup of
 * old files is skipped and the previous manifest entries are carried over, so
 * a transient failure does not delete that origin's earlier output. Any other
 * error is fatal and terminates the process with exit code 1.
 */
async function main(): Promise<void> {
  console.log("š Extract from Sitemap CLI");

  try {
    const config = await loadConfig();
    const apiKey = await getApiKey();

    // Ensure output directory exists
    mkdirSync(config.outDir, { recursive: true });

    // Load previous manifest
    const previousManifest = loadManifest(config.outDir);
    const currentFiles: string[] = [];

    let totalTokens = 0;
    let totalPages = 0;
    let totalErrors = 0;
    let failedOrigins = 0;

    // Process each origin
    for (const origin of config.origins) {
      console.log(`\nš Processing origin: ${origin}`);

      try {
        const result = await extractFromSitemap(
          origin,
          config.forceExtract,
          apiKey
        );

        console.log(
          `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
        );
        if (result.errors > 0) {
          console.log(`⚠️ ${result.errors} errors occurred`);
        }

        // Write files to disk
        for (const [urlPath, file] of Object.entries(result.files)) {
          const relativePath = urlPath.startsWith("/")
            ? urlPath.slice(1)
            : urlPath;
          let filename = relativePath;

          if (!config.keepOriginalUrls) {
            // Create domain-specific subdirectory
            const domain = new URL(
              origin.startsWith("http") ? origin : `https://${origin}`
            ).hostname;
            const domainDir = join(config.outDir, domain);
            mkdirSync(domainDir, { recursive: true });
            filename = join(domain, relativePath);
          }

          const filePath = join(config.outDir, filename);
          const fileDir = dirname(filePath);

          mkdirSync(fileDir, { recursive: true });
          writeFileSync(filePath, file.content);
          currentFiles.push(filename);

          console.log(`š Wrote: ${filename} (${file.tokens} tokens)`);
        }

        totalTokens += result.totalTokens;
        totalPages += result.totalPages;
        totalErrors += result.errors;
      } catch (error) {
        console.error(
          `ā Error processing ${origin}:`,
          error instanceof Error ? error.message : String(error)
        );
        totalErrors++;
        failedOrigins++;
      }
    }

    // Process custom URLs
    if (config.customUrls.length > 0) {
      console.log(`\nš Processing ${config.customUrls.length} custom URLs...`);
      const customFiles = await processCustomUrls(
        config.customUrls,
        apiKey,
        config.forceExtract
      );

      for (const [filename, file] of Object.entries(customFiles)) {
        const filePath = join(config.outDir, filename);
        writeFileSync(filePath, file.content);
        currentFiles.push(filename);
        totalTokens += file.tokens;
        totalPages++;

        console.log(`š Wrote: ${filename} (${file.tokens} tokens)`);
      }
    }

    // Clean up old files
    let manifestFiles = currentFiles;
    if (failedOrigins > 0) {
      // A failed origin contributes nothing to currentFiles; cleaning up now
      // would delete everything it produced on earlier runs. Keep the old
      // entries in the manifest so a later successful run can still tidy up.
      console.log(
        `⚠️ Skipping cleanup of old files because ${failedOrigins} origin(s) failed`
      );
      manifestFiles = [
        ...new Set([...currentFiles, ...previousManifest.files]),
      ];
    } else if (previousManifest.files.length > 0) {
      cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
    }

    // Save new manifest
    const newManifest: Manifest = {
      files: manifestFiles,
      timestamp: new Date().toISOString(),
    };
    saveManifest(config.outDir, newManifest);

    console.log(`\n⨠Extraction completed!`);
    console.log(`š Total: ${totalPages} pages, ${totalTokens} tokens`);
    if (totalErrors > 0) {
      console.log(`⚠️ Errors: ${totalErrors}`);
    }
    console.log(`š Output directory: ${resolve(config.outDir)}`);
  } catch (error) {
    console.error(
      "š„ Fatal error:",
      error instanceof Error ? error.message : String(error)
    );
    process.exit(1);
  }
}
|
|
431
|
-
|
|
432
|
-
// Start the CLI only when the runtime flags this module as the entry point
// (`import.meta.main`). `main` handles its own errors, so the returned
// promise is intentionally not awaited.
if (import.meta.main) void main();
|