extract-from-sitemap 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/cli.js +602 -0
  2. package/package.json +4 -5
  3. package/cli.ts +0 -434
package/cli.js ADDED
@@ -0,0 +1,602 @@
1
+ #!/usr/bin/env node
2
+
3
+ const fs = require("fs");
4
+ const path = require("path");
5
+ const { spawn } = require("child_process");
6
+ const crypto = require("crypto");
7
+ const http = require("http");
8
+ const { URL, URLSearchParams } = require("url");
9
+ const os = require("os");
10
+
11
+ /**
12
+ * @typedef {Object} Config
13
+ * @property {string} outDir - Output directory for extracted files
14
+ * @property {string[]} origins - Array of origin URLs to process
15
+ * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
16
+ * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
17
+ * @property {boolean} forceExtract - Whether to force extraction even if files exist
18
+ */
19
+
20
+ /**
21
+ * @typedef {Object} Manifest
22
+ * @property {string[]} files - List of generated files
23
+ * @property {string} timestamp - Timestamp of last generation
24
+ */
25
+
26
// Cached Parallel.ai API key lives at ~/.llmtext/api-key; the directory
// constant is derived from the file path so the two can never disagree.
const API_KEY_FILE = path.join(os.homedir(), ".llmtext", "api-key");
const CREDENTIALS_DIR = path.dirname(API_KEY_FILE);
28
+
29
+ /**
30
+ * OAuth handler for Parallel.ai API key authentication
31
+ */
32
+ class OAuth {
33
+ constructor() {
34
+ this.clientId = "extract-from-sitemap-cli";
35
+ this.redirectUri = "http://localhost:3737/callback";
36
+ this.scope = "key:read";
37
+ this.server = null;
38
+ }
39
+
40
+ /**
41
+ * Get API key through OAuth flow
42
+ * @returns {Promise<string>} The API key
43
+ */
44
+ async getApiKey() {
45
+ console.log("šŸ” Starting OAuth flow...");
46
+
47
+ // Generate PKCE parameters
48
+ const { codeVerifier, codeChallenge } = await this.generatePKCE();
49
+
50
+ // Build authorization URL
51
+ const authUrl = new URL("https://platform.parallel.ai/getKeys/authorize");
52
+ authUrl.searchParams.set("client_id", this.clientId);
53
+ authUrl.searchParams.set("redirect_uri", this.redirectUri);
54
+ authUrl.searchParams.set("response_type", "code");
55
+ authUrl.searchParams.set("scope", this.scope);
56
+ authUrl.searchParams.set("code_challenge", codeChallenge);
57
+ authUrl.searchParams.set("code_challenge_method", "S256");
58
+ authUrl.searchParams.set("state", Math.random().toString(36));
59
+
60
+ console.log("\nšŸ“– Opening browser for authorization...");
61
+
62
+ // Open browser automatically
63
+ await this.openBrowser(authUrl.toString());
64
+
65
+ // Start simple HTTP server to catch the callback
66
+ const code = await this.startCallbackServer();
67
+
68
+ // Exchange code for token
69
+ console.log("šŸ”„ Exchanging authorization code for API key...");
70
+
71
+ const response = await fetch("https://platform.parallel.ai/getKeys/token", {
72
+ method: "POST",
73
+ headers: { "Content-Type": "application/x-www-form-urlencoded" },
74
+ body: new URLSearchParams({
75
+ grant_type: "authorization_code",
76
+ code: code,
77
+ client_id: this.clientId,
78
+ redirect_uri: this.redirectUri,
79
+ code_verifier: codeVerifier,
80
+ }),
81
+ });
82
+
83
+ if (!response.ok) {
84
+ throw new Error(
85
+ `Token exchange failed: ${response.status} ${response.statusText}`
86
+ );
87
+ }
88
+
89
+ const { access_token } = await response.json();
90
+ console.log("āœ… Successfully obtained API key!");
91
+
92
+ return access_token;
93
+ }
94
+
95
+ /**
96
+ * Open browser to authorization URL
97
+ * @param {string} url - The authorization URL
98
+ */
99
+ async openBrowser(url) {
100
+ try {
101
+ const platform = process.platform;
102
+ let command, args;
103
+
104
+ if (platform === "darwin") {
105
+ command = "open";
106
+ args = [url];
107
+ } else if (platform === "win32") {
108
+ command = "start";
109
+ args = ["", url];
110
+ } else {
111
+ // Linux/Unix
112
+ command = "xdg-open";
113
+ args = [url];
114
+ }
115
+
116
+ spawn(command, args, { detached: true, stdio: "ignore" });
117
+ } catch (error) {
118
+ console.log("\nšŸ“– Please visit this URL to authorize the application:");
119
+ console.log(`${url}\n`);
120
+ }
121
+ }
122
+
123
+ /**
124
+ * Generate PKCE code verifier and challenge
125
+ * @returns {Promise<{codeVerifier: string, codeChallenge: string}>}
126
+ */
127
+ async generatePKCE() {
128
+ const codeVerifier = crypto.randomBytes(32).toString("base64url");
129
+ const hash = crypto
130
+ .createHash("sha256")
131
+ .update(codeVerifier)
132
+ .digest("base64url");
133
+
134
+ return {
135
+ codeVerifier,
136
+ codeChallenge: hash,
137
+ };
138
+ }
139
+
140
+ /**
141
+ * Start HTTP server to catch OAuth callback
142
+ * @returns {Promise<string>} The authorization code
143
+ */
144
+ async startCallbackServer() {
145
+ return new Promise((resolve, reject) => {
146
+ this.server = http.createServer((req, res) => {
147
+ const url = new URL(req.url, `http://${req.headers.host}`);
148
+
149
+ if (url.pathname === "/callback") {
150
+ const code = url.searchParams.get("code");
151
+ const error = url.searchParams.get("error");
152
+
153
+ if (error) {
154
+ reject(new Error(`OAuth error: ${error}`));
155
+ res.writeHead(400);
156
+ res.end("Error occurred. You can close this window.");
157
+ return;
158
+ }
159
+
160
+ if (code) {
161
+ resolve(code);
162
+ res.writeHead(200, { "Content-Type": "text/html" });
163
+ res.end(
164
+ "āœ… Authorization successful! You can close this window and return to the terminal."
165
+ );
166
+ return;
167
+ }
168
+ }
169
+
170
+ res.writeHead(404);
171
+ res.end("Invalid request");
172
+ });
173
+
174
+ this.server.listen(3737);
175
+
176
+ // Timeout after 5 minutes
177
+ setTimeout(() => {
178
+ this.stopServer();
179
+ reject(new Error("OAuth flow timed out"));
180
+ }, 300000);
181
+ }).finally(() => {
182
+ this.stopServer();
183
+ });
184
+ }
185
+
186
+ /**
187
+ * Stop the callback server
188
+ */
189
+ stopServer() {
190
+ if (this.server) {
191
+ this.server.close();
192
+ this.server = null;
193
+ }
194
+ }
195
+ }
196
+
197
+ /**
198
+ * Load configuration from llmtext.json
199
+ * @returns {Promise<Config>} The configuration object
200
+ */
201
+ async function loadConfig() {
202
+ const configPath = path.resolve("llmtext.json");
203
+
204
+ if (!fs.existsSync(configPath)) {
205
+ console.error(
206
+ "āŒ llmtext.json not found. Please create a configuration file."
207
+ );
208
+ console.log("\nExample llmtext.json:");
209
+ console.log(
210
+ JSON.stringify(
211
+ {
212
+ outDir: "./docs",
213
+ origins: ["https://docs.example.com"],
214
+ customUrls: [],
215
+ keepOriginalUrls: false,
216
+ forceExtract: false,
217
+ },
218
+ null,
219
+ 2
220
+ )
221
+ );
222
+ process.exit(1);
223
+ }
224
+
225
+ try {
226
+ const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
227
+
228
+ // Validate required fields
229
+ if (!config.outDir) throw new Error("outDir is required");
230
+ if (!Array.isArray(config.origins))
231
+ throw new Error("origins must be an array");
232
+
233
+ // Set defaults
234
+ config.customUrls = config.customUrls || [];
235
+ config.keepOriginalUrls = config.keepOriginalUrls ?? false;
236
+ config.forceExtract = config.forceExtract ?? false;
237
+
238
+ return config;
239
+ } catch (error) {
240
+ console.error("āŒ Error reading llmtext.json:", error.message);
241
+ process.exit(1);
242
+ }
243
+ }
244
+
245
/**
 * Store API key in ~/.llmtext/api-key, readable by the owner only.
 * Failures are reported as a warning and never thrown.
 * @param {string} apiKey - The API key to store
 */
function storeApiKey(apiKey) {
  try {
    // Keep the credentials directory private to the current user.
    fs.mkdirSync(CREDENTIALS_DIR, { recursive: true, mode: 0o700 });
    fs.writeFileSync(API_KEY_FILE, apiKey, { mode: 0o600 });
    // `mode` above only applies when the file is newly created; tighten the
    // permissions of a pre-existing file as well.
    fs.chmodSync(API_KEY_FILE, 0o600);
    console.log("šŸ’¾ API key stored securely in ~/.llmtext/api-key");
  } catch (error) {
    console.warn("āš ļø Could not store API key:", error.message);
  }
}
258
+
259
/**
 * Read the cached API key from ~/.llmtext/api-key.
 * Read errors are reported as a warning and treated as "no key".
 * @returns {string|null} The trimmed key, or null when the file is missing,
 *   empty, or unreadable
 */
function loadStoredApiKey() {
  try {
    if (!fs.existsSync(API_KEY_FILE)) {
      return null;
    }

    const storedKey = fs.readFileSync(API_KEY_FILE, "utf8").trim();
    if (storedKey) {
      console.log("šŸ”‘ Using stored API key from ~/.llmtext/api-key");
      return storedKey;
    }
  } catch (err) {
    console.warn("āš ļø Could not read stored API key:", err.message);
  }

  return null;
}
277
+
278
/**
 * Get API key from various sources or start OAuth flow.
 *
 * Lookup order: stored key file, PARALLEL_API_KEY environment variable,
 * PARALLEL_API_KEY entry in ./.env, then the interactive OAuth flow. A key
 * found in the environment or obtained through OAuth is stored for next time.
 * @returns {Promise<string>} The API key
 */
async function getApiKey() {
  // Check stored API key first
  const storedKey = loadStoredApiKey();
  if (storedKey) {
    return storedKey;
  }

  // Check environment variables
  let apiKey = process.env.PARALLEL_API_KEY;

  if (!apiKey && fs.existsSync(".env")) {
    // Try to load from .env file. Accepts an optional `export ` prefix and
    // spaces around `=`; [ \t] (not \s) keeps the match on a single line.
    const envContent = fs.readFileSync(".env", "utf8");
    const match = envContent.match(
      /^[ \t]*(?:export[ \t]+)?PARALLEL_API_KEY[ \t]*=[ \t]*(.+)$/m
    );
    if (match) {
      const rawValue = match[1].trim();
      // Strip one pair of matching quotes so KEY="value" yields value.
      const quoted = rawValue.match(/^(["'])(.*)\1$/);
      apiKey = quoted ? quoted[2] : rawValue;
    }
  }

  if (apiKey) {
    console.log("šŸ”‘ Using API key from environment");
    storeApiKey(apiKey);
    return apiKey;
  }

  // No API key found, start OAuth flow
  console.log("šŸ”‘ No API key found. Starting OAuth flow...");
  const oauth = new OAuth();
  const newApiKey = await oauth.getApiKey();

  storeApiKey(newApiKey);
  return newApiKey;
}
315
+
316
/**
 * Load manifest file (llmtext-manifest.json inside the output directory).
 *
 * A missing, unparsable, or malformed manifest yields an empty manifest, so
 * the returned object always has a `files` array containing only strings.
 * @param {string} outDir - Output directory
 * @returns {Manifest} The manifest object
 */
function loadManifest(outDir) {
  const manifestPath = path.join(outDir, "llmtext-manifest.json");
  const emptyManifest = () => ({
    files: [],
    timestamp: new Date().toISOString(),
  });

  if (!fs.existsSync(manifestPath)) {
    return emptyManifest();
  }

  try {
    const parsed = JSON.parse(fs.readFileSync(manifestPath, "utf8"));

    // Valid JSON is not necessarily a valid manifest (e.g. `{}` or `null`);
    // callers read `files.length` and join each entry onto a path.
    if (!parsed || typeof parsed !== "object" || !Array.isArray(parsed.files)) {
      return emptyManifest();
    }

    return {
      ...parsed,
      files: parsed.files.filter((entry) => typeof entry === "string"),
    };
  } catch {
    return emptyManifest();
  }
}
334
+
335
/**
 * Write the manifest as pretty-printed JSON (two-space indent) to
 * llmtext-manifest.json inside the output directory.
 * @param {string} outDir - Output directory
 * @param {Manifest} manifest - The manifest to save
 */
function saveManifest(outDir, manifest) {
  const target = path.join(outDir, "llmtext-manifest.json");
  const serialized = JSON.stringify(manifest, null, 2);
  fs.writeFileSync(target, serialized);
}
344
+
345
/**
 * Clean up old files that are no longer generated.
 *
 * Only paths that resolve strictly inside `outDir` are removed; a manifest
 * entry that escapes it (for example via "..") is skipped with a warning.
 * Removal failures are reported as warnings and do not stop the loop.
 * @param {string} outDir - Output directory
 * @param {string[]} currentFiles - Currently generated files
 * @param {string[]} previousFiles - Previously generated files
 */
function cleanupOldFiles(outDir, currentFiles, previousFiles) {
  const keep = new Set(currentFiles); // O(1) membership instead of includes()
  const outRoot = path.resolve(outDir);

  for (const file of previousFiles) {
    if (keep.has(file)) {
      continue;
    }

    try {
      const filePath = path.join(outDir, file);
      const relative = path.relative(outRoot, path.resolve(filePath));
      const escapesOutDir =
        relative === "" ||
        relative === ".." ||
        relative.startsWith(`..${path.sep}`) ||
        path.isAbsolute(relative);

      if (escapesOutDir) {
        console.warn(
          `āš ļø Could not remove ${file}:`,
          "path is outside the output directory"
        );
        continue;
      }

      if (fs.existsSync(filePath)) {
        fs.rmSync(filePath);
        console.log(`šŸ—‘ļø Removed old file: ${file}`);
      }
    } catch (error) {
      console.warn(`āš ļø Could not remove ${file}:`, error.message);
    }
  }
}
368
+
369
+ /**
370
+ * Process custom URLs through extraction API
371
+ * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
372
+ * @param {string} apiKey - API key for authentication
373
+ * @param {boolean} forceExtract - Whether to force extraction
374
+ * @returns {Promise<Record<string, any>>} Extracted files
375
+ */
376
+ async function processCustomUrls(customUrls, apiKey, forceExtract) {
377
+ const files = {};
378
+
379
+ for (const customUrl of customUrls) {
380
+ console.log(`šŸ“„ Processing custom URL: ${customUrl.url}`);
381
+
382
+ try {
383
+ const response = await fetch("https://api.parallel.ai/v1beta/extract", {
384
+ method: "POST",
385
+ headers: {
386
+ "Content-Type": "application/json",
387
+ "parallel-beta": "search-extract-2025-10-10",
388
+ "x-api-key": apiKey,
389
+ },
390
+ body: JSON.stringify({
391
+ urls: [customUrl.url],
392
+ full_content: true,
393
+ }),
394
+ });
395
+
396
+ if (response.ok) {
397
+ const result = await response.json();
398
+ if (result.results && result.results.length > 0) {
399
+ const extracted = result.results[0];
400
+ const filename =
401
+ customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
402
+
403
+ files[filename] = {
404
+ content: extracted.full_content || "",
405
+ title: customUrl.title,
406
+ description: customUrl.description,
407
+ extracted: true,
408
+ publishedDate: extracted.published_date || "",
409
+ status: 200,
410
+ tokens: Math.round((extracted.full_content || "").length / 5),
411
+ };
412
+ }
413
+ }
414
+ } catch (error) {
415
+ console.error(
416
+ `āŒ Error processing custom URL ${customUrl.url}:`,
417
+ error.message
418
+ );
419
+ }
420
+ }
421
+
422
+ return files;
423
+ }
424
+
425
/**
 * Delete the cached API key file, if there is one.
 * Errors are logged and never thrown.
 */
async function clearCredentials() {
  try {
    const hasStoredKey = fs.existsSync(API_KEY_FILE);

    if (!hasStoredKey) {
      console.log("ā„¹ļø No stored API key found to clear");
      return;
    }

    fs.unlinkSync(API_KEY_FILE);
    console.log("āœ… Cleared stored API key from ~/.llmtext/api-key");
  } catch (err) {
    console.error("āŒ Error clearing credentials:", err.message);
  }
}
440
+
441
/**
 * Extract content from sitemap.
 *
 * Delegates to the `extractFromSitemap` export of ./mod.js, the module this
 * package ships as its `main` entry, instead of returning a hard-coded empty
 * result (which made every run report zero pages).
 * @param {string} origin - The origin URL
 * @param {boolean} forceExtract - Whether to force extraction
 * @param {string} apiKey - API key for authentication
 * @returns {Promise<{totalPages: number, totalTokens: number, errors: number, files: Record<string, any>}>}
 * @throws {Error} If ./mod.js cannot be loaded or does not export the function
 */
async function extractFromSitemap(origin, forceExtract, apiKey) {
  // Dynamic import() loads both CommonJS and ES-module builds of mod.js and
  // resolves the specifier relative to this file.
  const modulePath = "./mod.js";
  const mod = await import(modulePath);

  // For a CommonJS module the exports may only be reachable via `default`.
  const implementation =
    mod.extractFromSitemap ?? (mod.default && mod.default.extractFromSitemap);

  if (typeof implementation !== "function") {
    throw new Error("mod.js does not export extractFromSitemap");
  }

  return implementation(origin, forceExtract, apiKey);
}
461
+
462
/**
 * Main function: CLI entry point.
 *
 * With `--clear-credentials` only the stored API key is removed. Otherwise the
 * configuration and API key are loaded, every origin and custom URL is
 * extracted into `outDir`, files from the previous run that were not produced
 * again are deleted, and a new manifest is written. Exits with status 1 on a
 * fatal error.
 */
async function main() {
  console.log("šŸš€ Extract from Sitemap CLI");

  // Check for special commands
  const args = process.argv.slice(2);
  if (args.includes("--clear-credentials")) {
    await clearCredentials();
    return;
  }

  try {
    const config = await loadConfig();
    const apiKey = await getApiKey();

    // Ensure output directory exists
    fs.mkdirSync(config.outDir, { recursive: true });
    const outRoot = path.resolve(config.outDir);

    // True when `candidate` resolves strictly inside the output directory.
    const isInsideOutDir = (candidate) => {
      const relative = path.relative(outRoot, path.resolve(candidate));
      return (
        relative !== "" &&
        relative !== ".." &&
        !relative.startsWith(`..${path.sep}`) &&
        !path.isAbsolute(relative)
      );
    };

    // Load previous manifest
    const previousManifest = loadManifest(config.outDir);
    const previousFiles = Array.isArray(previousManifest.files)
      ? previousManifest.files
      : [];
    const currentFiles = [];

    let totalTokens = 0;
    let totalPages = 0;
    let totalErrors = 0;
    let failedOrigins = 0;

    // Process each origin
    for (const origin of config.origins) {
      console.log(`\n🌐 Processing origin: ${origin}`);

      try {
        const result = await extractFromSitemap(
          origin,
          config.forceExtract,
          apiKey
        );

        console.log(
          `āœ… Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
        );
        if (result.errors > 0) {
          console.log(`āš ļø ${result.errors} errors occurred`);
        }

        // Write files to disk
        for (const [filePath, file] of Object.entries(result.files)) {
          const relativePath = filePath.startsWith("/")
            ? filePath.slice(1)
            : filePath;
          let filename = relativePath;

          if (!config.keepOriginalUrls) {
            // Group files in a subdirectory named after the origin's hostname
            const domain = new URL(
              origin.startsWith("http") ? origin : `https://${origin}`
            ).hostname;
            filename = path.join(domain, relativePath);
          }

          const fullFilePath = path.join(config.outDir, filename);

          // File paths come from the extraction result; never let one such
          // as "../x" write outside the output directory.
          if (!isInsideOutDir(fullFilePath)) {
            console.error(
              `āŒ Error processing ${origin}:`,
              `refusing to write outside the output directory: ${filePath}`
            );
            totalErrors++;
            continue;
          }

          // Recursive mkdir also creates the hostname directory.
          fs.mkdirSync(path.dirname(fullFilePath), { recursive: true });
          fs.writeFileSync(fullFilePath, file.content);
          currentFiles.push(filename);

          console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
        }

        totalTokens += result.totalTokens;
        totalPages += result.totalPages;
        totalErrors += result.errors;
      } catch (error) {
        console.error(`āŒ Error processing ${origin}:`, error.message);
        totalErrors++;
        failedOrigins++;
      }
    }

    // Process custom URLs
    if (config.customUrls.length > 0) {
      console.log(`\nšŸ“‹ Processing ${config.customUrls.length} custom URLs...`);
      const customFiles = await processCustomUrls(
        config.customUrls,
        apiKey,
        config.forceExtract
      );

      for (const [filename, file] of Object.entries(customFiles)) {
        const filePath = path.join(config.outDir, filename);
        fs.writeFileSync(filePath, file.content);
        currentFiles.push(filename);
        totalTokens += file.tokens;
        totalPages++;

        console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
      }
    }

    // Clean up old files. An origin that failed outright contributed no
    // files, so cleaning up now would delete its still-valid output from the
    // previous run; in that case keep everything and carry the old entries
    // forward so a later successful run can still clean them up.
    let manifestFiles = currentFiles;
    if (failedOrigins > 0) {
      manifestFiles = [...new Set([...currentFiles, ...previousFiles])];
      if (previousFiles.length > 0) {
        console.log(
          `āš ļø Skipping cleanup of old files: ${failedOrigins} origin(s) failed`
        );
      }
    } else if (previousFiles.length > 0) {
      cleanupOldFiles(config.outDir, currentFiles, previousFiles);
    }

    // Save new manifest
    const newManifest = {
      files: manifestFiles,
      timestamp: new Date().toISOString(),
    };
    saveManifest(config.outDir, newManifest);

    console.log("\n✨ Extraction completed!");
    console.log(`šŸ“Š Total: ${totalPages} pages, ${totalTokens} tokens`);
    if (totalErrors > 0) {
      console.log(`āš ļø Errors: ${totalErrors}`);
    }
    console.log(`šŸ“ Output directory: ${path.resolve(config.outDir)}`);
    console.log("\nšŸ’” Use --clear-credentials to remove stored API key");
  } catch (error) {
    console.error("šŸ’„ Fatal error:", error.message);
    process.exit(1);
  }
}
590
+
591
+ // Run main function if this file is executed directly
592
+ if (require.main === module) {
593
+ main();
594
+ }
595
+
596
+ module.exports = {
597
+ OAuth,
598
+ loadConfig,
599
+ getApiKey,
600
+ clearCredentials,
601
+ main,
602
+ };
package/package.json CHANGED
@@ -1,16 +1,15 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
- "bin": "cli.ts",
4
- "version": "0.0.1",
3
+ "bin": "cli.js",
4
+ "version": "0.0.3",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [
8
8
  "mod.js",
9
- "cli.ts"
9
+ "cli.js"
10
10
  ],
11
11
  "license": "MIT",
12
12
  "devDependencies": {
13
- "@cloudflare/workers-types": "4.20251011.0",
14
- "@types/bun": "1.3.0"
13
+ "@cloudflare/workers-types": "4.20251011.0"
15
14
  }
16
15
  }
package/cli.ts DELETED
@@ -1,434 +0,0 @@
1
- #!/usr/bin/env bun
2
- /// <reference types="@types/bun" />
3
- /// <reference lib="esnext" />
4
-
5
- import {
6
- existsSync,
7
- readFileSync,
8
- writeFileSync,
9
- mkdirSync,
10
- rmSync,
11
- readdirSync,
12
- } from "fs";
13
- import { join, dirname, resolve } from "path";
14
- import { extractFromSitemap } from "./mod.js";
15
-
16
/** Contents of llmtext.json after defaults have been applied. */
interface Config {
  /** Output directory for extracted files. */
  outDir: string;
  /** Origin URLs to process. */
  origins: string[];
  /** Individual pages to extract in addition to the origins. */
  customUrls: { title: string; description: string; url: string }[];
  /** When false, files of each origin are written under a per-hostname subdirectory. */
  keepOriginalUrls: boolean;
  /** Passed through to the extraction calls. */
  forceExtract: boolean;
}
27
-
28
/** Record of one run, stored as llmtext-manifest.json in the output directory. */
interface Manifest {
  /** ISO-8601 time at which the manifest was created. */
  timestamp: string;
  /** Paths of the generated files, relative to the output directory. */
  files: string[];
}
32
-
33
/**
 * OAuth handler for Parallel.ai API key authentication.
 *
 * Runs an authorization-code flow with PKCE: the user opens the printed URL,
 * a temporary Bun HTTP server on port 3737 receives the redirect, and the
 * returned code is exchanged for an API key.
 */
class OAuth {
  private readonly clientId: string;
  private readonly redirectUri: string;
  private readonly scope: string;

  constructor() {
    this.clientId = "extract-from-sitemap-cli";
    this.redirectUri = "http://localhost:3737/callback";
    this.scope = "key:read";
  }

  /**
   * Run the full flow and return the API key.
   * @throws Error if authorization fails or times out, or the token endpoint
   *   rejects the exchange or returns no access_token.
   */
  async getApiKey(): Promise<string> {
    console.log("šŸ” Starting OAuth flow...");

    // Generate PKCE parameters
    const { codeVerifier, codeChallenge } = await this.generatePKCE();

    // Unpredictable per-request value; Math.random() is not suitable here.
    const state = OAuth.toBase64Url(crypto.getRandomValues(new Uint8Array(16)));

    // Build authorization URL
    const authUrl = new URL("https://platform.parallel.ai/getKeys/authorize");
    authUrl.searchParams.set("client_id", this.clientId);
    authUrl.searchParams.set("redirect_uri", this.redirectUri);
    authUrl.searchParams.set("response_type", "code");
    authUrl.searchParams.set("scope", this.scope);
    authUrl.searchParams.set("code_challenge", codeChallenge);
    authUrl.searchParams.set("code_challenge_method", "S256");
    authUrl.searchParams.set("state", state);

    console.log(`\nšŸ“– Please visit this URL to authorize the application:`);
    console.log(`${authUrl.toString()}\n`);

    // Start simple HTTP server to catch the callback
    const code = await this.startCallbackServer(state);

    // Exchange code for token
    console.log("šŸ”„ Exchanging authorization code for API key...");

    const response = await fetch("https://platform.parallel.ai/getKeys/token", {
      method: "POST",
      headers: { "Content-Type": "application/x-www-form-urlencoded" },
      body: new URLSearchParams({
        grant_type: "authorization_code",
        code: code,
        client_id: this.clientId,
        redirect_uri: this.redirectUri,
        code_verifier: codeVerifier,
      }),
    });

    if (!response.ok) {
      throw new Error(
        `Token exchange failed: ${response.status} ${response.statusText}`
      );
    }

    const { access_token } = (await response.json()) as { access_token?: unknown };
    if (typeof access_token !== "string" || access_token === "") {
      // Without this check `undefined` would be returned as the key.
      throw new Error("Token exchange failed: response did not contain an access_token");
    }
    console.log("āœ… Successfully obtained API key!");

    return access_token;
  }

  /** Encode bytes as unpadded base64url ("+" -> "-", "/" -> "_", "=" dropped). */
  private static toBase64Url(bytes: Uint8Array): string {
    return btoa(String.fromCharCode(...bytes))
      .replace(/\+/g, "-")
      .replace(/\//g, "_")
      .replace(/=/g, "");
  }

  /** Create a random code verifier and its SHA-256 (S256) code challenge. */
  private async generatePKCE(): Promise<{
    codeVerifier: string;
    codeChallenge: string;
  }> {
    const codeVerifier = OAuth.toBase64Url(
      crypto.getRandomValues(new Uint8Array(32))
    );

    const hash = await crypto.subtle.digest(
      "SHA-256",
      new TextEncoder().encode(codeVerifier)
    );
    const codeChallenge = OAuth.toBase64Url(new Uint8Array(hash));

    return { codeVerifier, codeChallenge };
  }

  /**
   * Serve http://localhost:3737/callback until the redirect arrives.
   * @param expectedState When given, a callback carrying a different `state`
   *   is answered with 400 and ignored. A callback without any `state` is
   *   still accepted, since it is not visible here whether the authorization
   *   server echoes the parameter.
   * @returns The authorization code.
   */
  private async startCallbackServer(expectedState?: string): Promise<string> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    let server: { stop(): void } | undefined;

    return new Promise<string>((resolve, reject) => {
      server = Bun.serve({
        port: 3737,
        fetch(req: Request): Response {
          const url = new URL(req.url);

          if (url.pathname === "/callback") {
            const code = url.searchParams.get("code");
            const error = url.searchParams.get("error");

            if (error) {
              reject(new Error(`OAuth error: ${error}`));
              return new Response(
                "Error occurred. You can close this window.",
                { status: 400 }
              );
            }

            if (code) {
              const returnedState = url.searchParams.get("state");
              if (
                expectedState !== undefined &&
                returnedState !== null &&
                returnedState !== expectedState
              ) {
                // Not our request; keep waiting for the genuine redirect.
                return new Response(
                  "State mismatch. You can close this window.",
                  { status: 400 }
                );
              }

              resolve(code);
              return new Response(
                "āœ… Authorization successful! You can close this window and return to the terminal."
              );
            }
          }

          return new Response("Invalid request", { status: 404 });
        },
      });

      // Timeout after 5 minutes
      timer = setTimeout(() => {
        reject(new Error("OAuth flow timed out"));
      }, 300000);
    }).finally(() => {
      // Runs on success, OAuth error and timeout alike, so neither the timer
      // nor the listening server keeps the process alive afterwards.
      clearTimeout(timer);
      server?.stop();
    });
  }
}
152
-
153
/**
 * Read and validate llmtext.json from the current working directory.
 *
 * Prints an example configuration and exits with status 1 when the file is
 * missing; also exits with status 1 when it cannot be parsed or fails
 * validation. Optional fields are filled in with their defaults.
 */
async function loadConfig(): Promise<Config> {
  const configFile = resolve("llmtext.json");

  if (!existsSync(configFile)) {
    const exampleConfig = {
      outDir: "./docs",
      origins: ["https://docs.example.com"],
      customUrls: [],
      keepOriginalUrls: false,
      forceExtract: false,
    };

    console.error(
      "āŒ llmtext.json not found. Please create a configuration file."
    );
    console.log("\nExample llmtext.json:");
    console.log(JSON.stringify(exampleConfig, null, 2));
    process.exit(1);
  }

  try {
    const rawText = readFileSync(configFile, "utf8");
    const parsed = JSON.parse(rawText) as Config;

    // Required fields
    if (!parsed.outDir) {
      throw new Error("outDir is required");
    }
    if (!Array.isArray(parsed.origins)) {
      throw new Error("origins must be an array");
    }

    // Defaults for optional fields
    if (!parsed.customUrls) {
      parsed.customUrls = [];
    }
    if (parsed.keepOriginalUrls == null) {
      parsed.keepOriginalUrls = false;
    }
    if (parsed.forceExtract == null) {
      parsed.forceExtract = false;
    }

    return parsed;
  } catch (err) {
    console.error("āŒ Error reading llmtext.json:", err.message);
    process.exit(1);
  }
}
196
-
197
/**
 * Resolve the Parallel.ai API key.
 *
 * Lookup order: PARALLEL_API_KEY environment variable, PARALLEL_API_KEY entry
 * in ./.env, then the interactive OAuth flow.
 */
async function getApiKey(): Promise<string> {
  // Check environment variables first
  let apiKey = process.env.PARALLEL_API_KEY;

  if (!apiKey && existsSync(".env")) {
    // Try to load from .env file. Accepts an optional `export ` prefix and
    // spaces around `=`; [ \t] (not \s) keeps the match on a single line.
    const envContent = readFileSync(".env", "utf8");
    const match = envContent.match(
      /^[ \t]*(?:export[ \t]+)?PARALLEL_API_KEY[ \t]*=[ \t]*(.+)$/m
    );
    if (match) {
      const rawValue = match[1].trim();
      // Strip one pair of matching quotes so KEY="value" yields value.
      const quoted = rawValue.match(/^(["'])(.*)\1$/);
      apiKey = quoted ? quoted[2] : rawValue;
    }
  }

  if (!apiKey) {
    console.log("šŸ”‘ No API key found in environment or .env file.");
    const oauth = new OAuth();
    apiKey = await oauth.getApiKey();
  }

  return apiKey;
}
218
-
219
/**
 * Load llmtext-manifest.json from the output directory.
 *
 * A missing, unparsable, or malformed manifest yields an empty manifest, so
 * the returned object always has a `files` array containing only strings.
 */
function loadManifest(outDir: string): Manifest {
  const manifestPath = join(outDir, "llmtext-manifest.json");
  const emptyManifest = (): Manifest => ({
    files: [],
    timestamp: new Date().toISOString(),
  });

  if (!existsSync(manifestPath)) {
    return emptyManifest();
  }

  try {
    const parsed: unknown = JSON.parse(readFileSync(manifestPath, "utf8"));

    // Valid JSON is not necessarily a valid manifest (e.g. `{}` or `null`);
    // callers read `files.length` and join each entry onto a path.
    if (typeof parsed !== "object" || parsed === null) {
      return emptyManifest();
    }
    const candidate = parsed as Partial<Manifest>;
    if (!Array.isArray(candidate.files)) {
      return emptyManifest();
    }

    return {
      ...(candidate as Manifest),
      files: candidate.files.filter(
        (entry): entry is string => typeof entry === "string"
      ),
    };
  } catch {
    return emptyManifest();
  }
}
232
-
233
/**
 * Write the manifest as pretty-printed JSON (two-space indent) to
 * llmtext-manifest.json inside the output directory.
 */
function saveManifest(outDir: string, manifest: Manifest): void {
  const target = join(outDir, "llmtext-manifest.json");
  const serialized = JSON.stringify(manifest, null, 2);
  writeFileSync(target, serialized);
}
237
-
238
/**
 * Delete files listed by the previous run that the current run did not
 * produce again.
 *
 * Only paths that resolve strictly inside `outDir` are removed; a manifest
 * entry that escapes it (for example via "..") is skipped with a warning.
 * Removal failures are reported as warnings and do not stop the loop.
 */
function cleanupOldFiles(
  outDir: string,
  currentFiles: string[],
  previousFiles: string[]
): void {
  const keep = new Set(currentFiles); // O(1) membership instead of includes()
  const outRoot = resolve(outDir);

  // Walks up from `target`; true only if outRoot is one of its ancestors.
  const isInsideOutDir = (target: string): boolean => {
    let current = dirname(target);
    while (current !== outRoot) {
      const parent = dirname(current);
      if (parent === current) {
        return false; // reached the filesystem root without meeting outRoot
      }
      current = parent;
    }
    return true;
  };

  for (const file of previousFiles) {
    if (keep.has(file)) {
      continue;
    }

    try {
      const filePath = join(outDir, file);

      if (!isInsideOutDir(resolve(filePath))) {
        console.warn(
          `āš ļø Could not remove ${file}:`,
          "path is outside the output directory"
        );
        continue;
      }

      if (existsSync(filePath)) {
        rmSync(filePath);
        console.log(`šŸ—‘ļø Removed old file: ${file}`);
      }
    } catch (error) {
      console.warn(
        `āš ļø Could not remove ${file}:`,
        error instanceof Error ? error.message : String(error)
      );
    }
  }
}
259
-
260
/**
 * Extract each custom URL through the Parallel.ai extract endpoint.
 *
 * The result is keyed by a filename derived from the entry's title
 * (non-alphanumeric characters replaced by "_", lower-cased, ".md" appended).
 * A URL that fails (network error, non-2xx status, or no result) is logged
 * and skipped.
 *
 * @param customUrls Pages to extract.
 * @param apiKey API key sent in the `x-api-key` header.
 * @param forceExtract Currently not used by this function.
 * @returns Extracted files keyed by filename.
 */
async function processCustomUrls(
  customUrls: Array<{ title: string; description: string; url: string }>,
  apiKey: string,
  forceExtract: boolean
): Promise<Record<string, any>> {
  const files: Record<string, any> = {};

  for (const customUrl of customUrls) {
    console.log(`šŸ“„ Processing custom URL: ${customUrl.url}`);

    try {
      // For custom URLs, we need to extract them individually
      const response = await fetch("https://api.parallel.ai/v1beta/extract", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "parallel-beta": "search-extract-2025-10-10",
          "x-api-key": apiKey,
        },
        body: JSON.stringify({
          urls: [customUrl.url],
          full_content: true,
        }),
      });

      // Surface HTTP failures instead of silently producing no file.
      if (!response.ok) {
        throw new Error(
          `Extract request failed: ${response.status} ${response.statusText}`
        );
      }

      const result = (await response.json()) as {
        results?: Array<{ full_content?: string; published_date?: string }>;
      } | null;
      const extracted =
        result && Array.isArray(result.results) ? result.results[0] : undefined;
      if (!extracted) {
        throw new Error("Extract response contained no results");
      }

      const content = extracted.full_content || "";
      const filename =
        customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";

      files[filename] = {
        content,
        title: customUrl.title,
        description: customUrl.description,
        extracted: true,
        publishedDate: extracted.published_date || "",
        status: 200,
        // Rough estimate: about five characters per token.
        tokens: Math.round(content.length / 5),
      };
    } catch (error) {
      console.error(
        `āŒ Error processing custom URL ${customUrl.url}:`,
        error instanceof Error ? error.message : String(error)
      );
    }
  }

  return files;
}
313
-
314
/**
 * CLI entry point: loads the configuration and API key, extracts every
 * origin and custom URL into `outDir`, deletes files from the previous run
 * that were not produced again, and writes a new manifest. Exits with
 * status 1 on a fatal error.
 */
async function main() {
  console.log("šŸš€ Extract from Sitemap CLI");

  try {
    const config = await loadConfig();
    const apiKey = await getApiKey();

    // Ensure output directory exists
    mkdirSync(config.outDir, { recursive: true });

    // State carried over from the previous run and collected during this one
    const previousManifest = loadManifest(config.outDir);
    const currentFiles: string[] = [];
    let totalTokens = 0;
    let totalPages = 0;
    let totalErrors = 0;

    // Process each origin
    for (const origin of config.origins) {
      console.log(`\n🌐 Processing origin: ${origin}`);

      try {
        const result = await extractFromSitemap(
          origin,
          config.forceExtract,
          apiKey
        );

        console.log(
          `āœ… Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
        );
        if (result.errors > 0) {
          console.log(`āš ļø ${result.errors} errors occurred`);
        }

        // Write files to disk
        for (const [urlPath, file] of Object.entries(result.files)) {
          let filename: string;

          if (config.keepOriginalUrls) {
            filename = urlPath.startsWith("/") ? urlPath.slice(1) : urlPath;
          } else {
            // Group files in a subdirectory named after the origin's hostname
            const originUrl = origin.startsWith("http")
              ? origin
              : `https://${origin}`;
            const domain = new URL(originUrl).hostname;
            mkdirSync(join(config.outDir, domain), { recursive: true });
            filename = join(
              domain,
              urlPath.startsWith("/") ? urlPath.slice(1) : urlPath
            );
          }

          const target = join(config.outDir, filename);
          mkdirSync(dirname(target), { recursive: true });
          writeFileSync(target, file.content);
          currentFiles.push(filename);

          console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
        }

        totalTokens += result.totalTokens;
        totalPages += result.totalPages;
        totalErrors += result.errors;
      } catch (error) {
        console.error(`āŒ Error processing ${origin}:`, error.message);
        totalErrors++;
      }
    }

    // Process custom URLs
    if (config.customUrls.length > 0) {
      console.log(`\nšŸ“‹ Processing ${config.customUrls.length} custom URLs...`);
      const customFiles = await processCustomUrls(
        config.customUrls,
        apiKey,
        config.forceExtract
      );

      for (const [filename, file] of Object.entries(customFiles)) {
        writeFileSync(join(config.outDir, filename), file.content);
        currentFiles.push(filename);
        totalTokens += file.tokens;
        totalPages++;

        console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
      }
    }

    // Clean up old files
    if (previousManifest.files.length > 0) {
      cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
    }

    // Save new manifest
    const newManifest: Manifest = {
      files: currentFiles,
      timestamp: new Date().toISOString(),
    };
    saveManifest(config.outDir, newManifest);

    console.log(`\n✨ Extraction completed!`);
    console.log(`šŸ“Š Total: ${totalPages} pages, ${totalTokens} tokens`);
    if (totalErrors > 0) {
      console.log(`āš ļø Errors: ${totalErrors}`);
    }
    console.log(`šŸ“ Output directory: ${resolve(config.outDir)}`);
  } catch (error) {
    console.error("šŸ’„ Fatal error:", error.message);
    process.exit(1);
  }
}
431
-
432
// Run the CLI only when this file is the program's entry point, not when it
// is imported by another module. The promise is deliberately not awaited.
if (import.meta.main) {
  void main();
}