extract-from-sitemap 0.0.2 → 0.0.4

This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registry they were published to. It is provided for informational purposes only.
Files changed (2):
  1. package/{cli.ts → cli.js} +225 -196
  2. package/package.json +4 -5
@@ -1,52 +1,47 @@
1
- #!/usr/bin/env bun
2
- /// <reference types="@types/bun" />
3
- /// <reference lib="esnext" />
4
-
5
- import {
6
- existsSync,
7
- readFileSync,
8
- writeFileSync,
9
- mkdirSync,
10
- rmSync,
11
- readdirSync,
12
- } from "fs";
13
- import { join, dirname, resolve } from "path";
14
- import { extractFromSitemap } from "./mod.js";
15
- import { secrets } from "bun";
16
-
17
- interface Config {
18
- outDir: string;
19
- origins: string[];
20
- customUrls: Array<{
21
- title: string;
22
- description: string;
23
- url: string;
24
- }>;
25
- keepOriginalUrls: boolean;
26
- forceExtract: boolean;
27
- }
28
-
29
- interface Manifest {
30
- files: string[];
31
- timestamp: string;
32
- }
33
-
34
- const SECRETS_SERVICE = "extract-from-sitemap-cli";
35
- const SECRETS_KEY = "parallel-api-key";
36
-
1
+ #!/usr/bin/env node
2
+
3
+ const fs = require("fs");
4
+ const path = require("path");
5
+ const { spawn } = require("child_process");
6
+ const crypto = require("crypto");
7
+ const http = require("http");
8
+ const { URL, URLSearchParams } = require("url");
9
+ const os = require("os");
10
+ const { extractFromSitemap } = require("./mod.js");
11
+ /**
12
+ * @typedef {Object} Config
13
+ * @property {string} outDir - Output directory for extracted files
14
+ * @property {string[]} origins - Array of origin URLs to process
15
+ * @property {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to extract
16
+ * @property {boolean} keepOriginalUrls - Whether to keep original URL structure
17
+ * @property {boolean} forceExtract - Whether to force extraction even if files exist
18
+ */
19
+
20
+ /**
21
+ * @typedef {Object} Manifest
22
+ * @property {string[]} files - List of generated files
23
+ * @property {string} timestamp - Timestamp of last generation
24
+ */
25
+
26
+ const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
27
+ const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
28
+
29
+ /**
30
+ * OAuth handler for Parallel.ai API key authentication
31
+ */
37
32
  class OAuth {
38
- private clientId: string;
39
- private redirectUri: string;
40
- private scope: string;
41
- private server?: Bun.Server;
42
-
43
33
  constructor() {
44
34
  this.clientId = "extract-from-sitemap-cli";
45
35
  this.redirectUri = "http://localhost:3737/callback";
46
36
  this.scope = "key:read";
37
+ this.server = null;
47
38
  }
48
39
 
49
- async getApiKey(): Promise<string> {
40
+ /**
41
+ * Get API key through OAuth flow
42
+ * @returns {Promise<string>} The API key
43
+ */
44
+ async getApiKey() {
50
45
  console.log("🔐 Starting OAuth flow...");
51
46
 
52
47
  // Generate PKCE parameters
@@ -62,7 +57,7 @@ class OAuth {
62
57
  authUrl.searchParams.set("code_challenge_method", "S256");
63
58
  authUrl.searchParams.set("state", Math.random().toString(36));
64
59
 
65
- console.log(`\n📖 Opening browser for authorization...`);
60
+ console.log("\n📖 Opening browser for authorization...");
66
61
 
67
62
  // Open browser automatically
68
63
  await this.openBrowser(authUrl.toString());
@@ -97,13 +92,14 @@ class OAuth {
97
92
  return access_token;
98
93
  }
99
94
 
100
- private async openBrowser(url: string): Promise<void> {
95
+ /**
96
+ * Open browser to authorization URL
97
+ * @param {string} url - The authorization URL
98
+ */
99
+ async openBrowser(url) {
101
100
  try {
102
- const { spawn } = require("child_process");
103
101
  const platform = process.platform;
104
-
105
- let command: string;
106
- let args: string[];
102
+ let command, args;
107
103
 
108
104
  if (platform === "darwin") {
109
105
  command = "open";
@@ -119,90 +115,93 @@ class OAuth {
119
115
 
120
116
  spawn(command, args, { detached: true, stdio: "ignore" });
121
117
  } catch (error) {
122
- console.log(`\n📖 Please visit this URL to authorize the application:`);
118
+ console.log("\n📖 Please visit this URL to authorize the application:");
123
119
  console.log(`${url}\n`);
124
120
  }
125
121
  }
126
122
 
127
- private async generatePKCE(): Promise<{
128
- codeVerifier: string;
129
- codeChallenge: string;
130
- }> {
131
- const codeVerifier = btoa(
132
- String.fromCharCode(...crypto.getRandomValues(new Uint8Array(32)))
133
- ).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
134
-
135
- const hash = await crypto.subtle.digest(
136
- "SHA-256",
137
- new TextEncoder().encode(codeVerifier)
138
- );
139
- const codeChallenge = btoa(
140
- String.fromCharCode(...new Uint8Array(hash))
141
- ).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
142
-
143
- return { codeVerifier, codeChallenge };
123
+ /**
124
+ * Generate PKCE code verifier and challenge
125
+ * @returns {Promise<{codeVerifier: string, codeChallenge: string}>}
126
+ */
127
+ async generatePKCE() {
128
+ const codeVerifier = crypto.randomBytes(32).toString("base64url");
129
+ const hash = crypto
130
+ .createHash("sha256")
131
+ .update(codeVerifier)
132
+ .digest("base64url");
133
+
134
+ return {
135
+ codeVerifier,
136
+ codeChallenge: hash,
137
+ };
144
138
  }
145
139
 
146
- private async startCallbackServer(): Promise<string> {
140
+ /**
141
+ * Start HTTP server to catch OAuth callback
142
+ * @returns {Promise<string>} The authorization code
143
+ */
144
+ async startCallbackServer() {
147
145
  return new Promise((resolve, reject) => {
148
- this.server = Bun.serve({
149
- port: 3737,
150
- fetch: (req) => {
151
- const url = new URL(req.url);
152
-
153
- if (url.pathname === "/callback") {
154
- const code = url.searchParams.get("code");
155
- const error = url.searchParams.get("error");
156
-
157
- if (error) {
158
- reject(new Error(`OAuth error: ${error}`));
159
- return new Response(
160
- "Error occurred. You can close this window.",
161
- { status: 400 }
162
- );
163
- }
164
-
165
- if (code) {
166
- resolve(code);
167
- // Don't stop server here - let the cleanup happen in the finally block
168
- return new Response(
169
- "✅ Authorization successful! You can close this window and return to the terminal.",
170
- {
171
- headers: {
172
- "Content-Type": "text/html",
173
- },
174
- }
175
- );
176
- }
146
+ this.server = http.createServer((req, res) => {
147
+ const url = new URL(req.url, `http://${req.headers.host}`);
148
+
149
+ if (url.pathname === "/callback") {
150
+ const code = url.searchParams.get("code");
151
+ const error = url.searchParams.get("error");
152
+
153
+ if (error) {
154
+ reject(new Error(`OAuth error: ${error}`));
155
+ res.writeHead(400);
156
+ res.end("Error occurred. You can close this window.");
157
+ return;
177
158
  }
178
159
 
179
- return new Response("Invalid request", { status: 404 });
180
- },
160
+ if (code) {
161
+ resolve(code);
162
+ res.writeHead(200, { "Content-Type": "text/html" });
163
+ res.end(
164
+ "✅ Authorization successful! You can close this window and return to the terminal."
165
+ );
166
+ return;
167
+ }
168
+ }
169
+
170
+ res.writeHead(404);
171
+ res.end("Invalid request");
181
172
  });
182
173
 
174
+ this.server.listen(3737);
175
+
183
176
  // Timeout after 5 minutes
184
177
  setTimeout(() => {
185
178
  this.stopServer();
186
179
  reject(new Error("OAuth flow timed out"));
187
180
  }, 300000);
188
181
  }).finally(() => {
189
- // Ensure server is stopped after promise resolves or rejects
190
182
  this.stopServer();
191
183
  });
192
184
  }
193
185
 
194
- private stopServer(): void {
186
+ /**
187
+ * Stop the callback server
188
+ */
189
+ stopServer() {
195
190
  if (this.server) {
196
- this.server.stop();
197
- this.server = undefined;
191
+ this.server.close();
192
+ this.server = null;
198
193
  }
199
194
  }
200
195
  }
201
196
 
202
- async function loadConfig(): Promise<Config> {
203
- const configPath = resolve("llmtext.json");
197
+ /**
198
+ * Load configuration from llmtext.json
199
+ * @returns {Promise<Config>} The configuration object
200
+ */
201
+ async function loadConfig() {
202
+ const configPath = path.resolve("llmtext.json");
204
203
 
205
- if (!existsSync(configPath)) {
204
+ if (!fs.existsSync(configPath)) {
206
205
  console.error(
207
206
  "❌ llmtext.json not found. Please create a configuration file."
208
207
  );
@@ -224,7 +223,7 @@ async function loadConfig(): Promise<Config> {
224
223
  }
225
224
 
226
225
  try {
227
- const config = JSON.parse(readFileSync(configPath, "utf8")) as Config;
226
+ const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
228
227
 
229
228
  // Validate required fields
230
229
  if (!config.outDir) throw new Error("outDir is required");
@@ -243,28 +242,56 @@ async function loadConfig(): Promise<Config> {
243
242
  }
244
243
  }
245
244
 
246
- async function getApiKey(): Promise<string> {
247
- // Check if we have a stored API key in the keychain
245
+ /**
246
+ * Store API key in ~/.llmtext/api-key
247
+ * @param {string} apiKey - The API key to store
248
+ */
249
+ function storeApiKey(apiKey) {
248
250
  try {
249
- const storedKey = await secrets.get({
250
- service: SECRETS_SERVICE,
251
- name: SECRETS_KEY,
252
- });
251
+ fs.mkdirSync(CREDENTIALS_DIR, { recursive: true });
252
+ fs.writeFileSync(API_KEY_FILE, apiKey, { mode: 0o600 }); // Only owner can read
253
+ console.log("💾 API key stored securely in ~/.llmtext/api-key");
254
+ } catch (error) {
255
+ console.warn("⚠️ Could not store API key:", error.message);
256
+ }
257
+ }
253
258
 
254
- if (storedKey) {
255
- console.log("🔑 Using stored API key from keychain");
256
- return storedKey;
259
+ /**
260
+ * Load API key from ~/.llmtext/api-key
261
+ * @returns {string|null} The stored API key or null if not found
262
+ */
263
+ function loadStoredApiKey() {
264
+ try {
265
+ if (fs.existsSync(API_KEY_FILE)) {
266
+ const apiKey = fs.readFileSync(API_KEY_FILE, "utf8").trim();
267
+ if (apiKey) {
268
+ console.log("🔑 Using stored API key from ~/.llmtext/api-key");
269
+ return apiKey;
270
+ }
257
271
  }
258
272
  } catch (error) {
259
- console.warn("⚠️ Could not access keychain:", error.message);
273
+ console.warn("⚠️ Could not read stored API key:", error.message);
274
+ }
275
+ return null;
276
+ }
277
+
278
+ /**
279
+ * Get API key from various sources or start OAuth flow
280
+ * @returns {Promise<string>} The API key
281
+ */
282
+ async function getApiKey() {
283
+ // Check stored API key first
284
+ const storedKey = loadStoredApiKey();
285
+ if (storedKey) {
286
+ return storedKey;
260
287
  }
261
288
 
262
- // Check environment variables as fallback
289
+ // Check environment variables
263
290
  let apiKey = process.env.PARALLEL_API_KEY;
264
291
 
265
- if (!apiKey && existsSync(".env")) {
292
+ if (!apiKey && fs.existsSync(".env")) {
266
293
  // Try to load from .env file
267
- const envContent = readFileSync(".env", "utf8");
294
+ const envContent = fs.readFileSync(".env", "utf8");
268
295
  const match = envContent.match(/^PARALLEL_API_KEY=(.+)$/m);
269
296
  if (match) {
270
297
  apiKey = match[1].trim();
@@ -273,17 +300,7 @@ async function getApiKey(): Promise<string> {
273
300
 
274
301
  if (apiKey) {
275
302
  console.log("🔑 Using API key from environment");
276
- // Store it in keychain for future use
277
- try {
278
- await secrets.set({
279
- service: SECRETS_SERVICE,
280
- name: SECRETS_KEY,
281
- value: apiKey,
282
- });
283
- console.log("💾 API key stored in keychain for future use");
284
- } catch (error) {
285
- console.warn("⚠️ Could not store API key in keychain:", error.message);
286
- }
303
+ storeApiKey(apiKey);
287
304
  return apiKey;
288
305
  }
289
306
 
@@ -292,57 +309,55 @@ async function getApiKey(): Promise<string> {
292
309
  const oauth = new OAuth();
293
310
  const newApiKey = await oauth.getApiKey();
294
311
 
295
- // Store the new API key in keychain
296
- try {
297
- await secrets.set({
298
- service: SECRETS_SERVICE,
299
- name: SECRETS_KEY,
300
- value: newApiKey,
301
- });
302
- console.log("💾 API key stored securely in keychain");
303
- } catch (error) {
304
- console.warn("⚠️ Could not store API key in keychain:", error.message);
305
- console.log(
306
- "💡 You may need to set PARALLEL_API_KEY environment variable for future runs"
307
- );
308
- }
309
-
312
+ storeApiKey(newApiKey);
310
313
  return newApiKey;
311
314
  }
312
315
 
313
- function loadManifest(outDir: string): Manifest {
314
- const manifestPath = join(outDir, "llmtext-manifest.json");
316
+ /**
317
+ * Load manifest file
318
+ * @param {string} outDir - Output directory
319
+ * @returns {Manifest} The manifest object
320
+ */
321
+ function loadManifest(outDir) {
322
+ const manifestPath = path.join(outDir, "llmtext-manifest.json");
315
323
 
316
- if (!existsSync(manifestPath)) {
324
+ if (!fs.existsSync(manifestPath)) {
317
325
  return { files: [], timestamp: new Date().toISOString() };
318
326
  }
319
327
 
320
328
  try {
321
- return JSON.parse(readFileSync(manifestPath, "utf8"));
329
+ return JSON.parse(fs.readFileSync(manifestPath, "utf8"));
322
330
  } catch {
323
331
  return { files: [], timestamp: new Date().toISOString() };
324
332
  }
325
333
  }
326
334
 
327
- function saveManifest(outDir: string, manifest: Manifest): void {
328
- const manifestPath = join(outDir, "llmtext-manifest.json");
329
- writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
335
+ /**
336
+ * Save manifest file
337
+ * @param {string} outDir - Output directory
338
+ * @param {Manifest} manifest - The manifest to save
339
+ */
340
+ function saveManifest(outDir, manifest) {
341
+ const manifestPath = path.join(outDir, "llmtext-manifest.json");
342
+ fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
330
343
  }
331
344
 
332
- function cleanupOldFiles(
333
- outDir: string,
334
- currentFiles: string[],
335
- previousFiles: string[]
336
- ): void {
345
+ /**
346
+ * Clean up old files that are no longer generated
347
+ * @param {string} outDir - Output directory
348
+ * @param {string[]} currentFiles - Currently generated files
349
+ * @param {string[]} previousFiles - Previously generated files
350
+ */
351
+ function cleanupOldFiles(outDir, currentFiles, previousFiles) {
337
352
  const filesToRemove = previousFiles.filter(
338
353
  (file) => !currentFiles.includes(file)
339
354
  );
340
355
 
341
356
  for (const file of filesToRemove) {
342
- const filePath = join(outDir, file);
357
+ const filePath = path.join(outDir, file);
343
358
  try {
344
- if (existsSync(filePath)) {
345
- rmSync(filePath);
359
+ if (fs.existsSync(filePath)) {
360
+ fs.rmSync(filePath);
346
361
  console.log(`🗑️ Removed old file: ${file}`);
347
362
  }
348
363
  } catch (error) {
@@ -351,18 +366,20 @@ function cleanupOldFiles(
351
366
  }
352
367
  }
353
368
 
354
- async function processCustomUrls(
355
- customUrls: Array<{ title: string; description: string; url: string }>,
356
- apiKey: string,
357
- forceExtract: boolean
358
- ): Promise<Record<string, any>> {
359
- const files: Record<string, any> = {};
369
+ /**
370
+ * Process custom URLs through extraction API
371
+ * @param {Array<{title: string, description: string, url: string}>} customUrls - Custom URLs to process
372
+ * @param {string} apiKey - API key for authentication
373
+ * @param {boolean} forceExtract - Whether to force extraction
374
+ * @returns {Promise<Record<string, any>>} Extracted files
375
+ */
376
+ async function processCustomUrls(customUrls, apiKey, forceExtract) {
377
+ const files = {};
360
378
 
361
379
  for (const customUrl of customUrls) {
362
380
  console.log(`📄 Processing custom URL: ${customUrl.url}`);
363
381
 
364
382
  try {
365
- // For custom URLs, we need to extract them individually
366
383
  const response = await fetch("https://api.parallel.ai/v1beta/extract", {
367
384
  method: "POST",
368
385
  headers: {
@@ -393,6 +410,8 @@ async function processCustomUrls(
393
410
  tokens: Math.round((extracted.full_content || "").length / 5),
394
411
  };
395
412
  }
413
+ } else {
414
+ throw new Error(`${response.status} - ${await response.statusText()}`);
396
415
  }
397
416
  } catch (error) {
398
417
  console.error(
@@ -405,16 +424,14 @@ async function processCustomUrls(
405
424
  return files;
406
425
  }
407
426
 
408
- // Add command for clearing stored credentials
409
- async function clearCredentials(): Promise<void> {
427
+ /**
428
+ * Clear stored API key credentials
429
+ */
430
+ async function clearCredentials() {
410
431
  try {
411
- const deleted = await secrets.delete({
412
- service: SECRETS_SERVICE,
413
- name: SECRETS_KEY,
414
- });
415
-
416
- if (deleted) {
417
- console.log("✅ Cleared stored API key from keychain");
432
+ if (fs.existsSync(API_KEY_FILE)) {
433
+ fs.unlinkSync(API_KEY_FILE);
434
+ console.log("✅ Cleared stored API key from ~/.llmtext/api-key");
418
435
  } else {
419
436
  console.log("ℹ️ No stored API key found to clear");
420
437
  }
@@ -423,6 +440,9 @@ async function clearCredentials(): Promise<void> {
423
440
  }
424
441
  }
425
442
 
443
+ /**
444
+ * Main function
445
+ */
426
446
  async function main() {
427
447
  console.log("🚀 Extract from Sitemap CLI");
428
448
 
@@ -438,11 +458,11 @@ async function main() {
438
458
  const apiKey = await getApiKey();
439
459
 
440
460
  // Ensure output directory exists
441
- mkdirSync(config.outDir, { recursive: true });
461
+ fs.mkdirSync(config.outDir, { recursive: true });
442
462
 
443
463
  // Load previous manifest
444
464
  const previousManifest = loadManifest(config.outDir);
445
- const currentFiles: string[] = [];
465
+ const currentFiles = [];
446
466
 
447
467
  let totalTokens = 0;
448
468
  let totalPages = 0;
@@ -467,29 +487,29 @@ async function main() {
467
487
  }
468
488
 
469
489
  // Write files to disk
470
- for (const [path, file] of Object.entries(result.files)) {
471
- let filename = path;
490
+ for (const [filePath, file] of Object.entries(result.files)) {
491
+ let filename = filePath;
472
492
 
473
493
  if (!config.keepOriginalUrls) {
474
494
  // Create domain-specific subdirectory
475
495
  const domain = new URL(
476
496
  origin.startsWith("http") ? origin : `https://${origin}`
477
497
  ).hostname;
478
- const domainDir = join(config.outDir, domain);
479
- mkdirSync(domainDir, { recursive: true });
480
- filename = join(
498
+ const domainDir = path.join(config.outDir, domain);
499
+ fs.mkdirSync(domainDir, { recursive: true });
500
+ filename = path.join(
481
501
  domain,
482
- path.startsWith("/") ? path.slice(1) : path
502
+ filePath.startsWith("/") ? filePath.slice(1) : filePath
483
503
  );
484
504
  } else {
485
- filename = path.startsWith("/") ? path.slice(1) : path;
505
+ filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
486
506
  }
487
507
 
488
- const filePath = join(config.outDir, filename);
489
- const fileDir = dirname(filePath);
508
+ const fullFilePath = path.join(config.outDir, filename);
509
+ const fileDir = path.dirname(fullFilePath);
490
510
 
491
- mkdirSync(fileDir, { recursive: true });
492
- writeFileSync(filePath, file.content);
511
+ fs.mkdirSync(fileDir, { recursive: true });
512
+ fs.writeFileSync(fullFilePath, file.content);
493
513
  currentFiles.push(filename);
494
514
 
495
515
  console.log(`📝 Wrote: ${filename} (${file.tokens} tokens)`);
@@ -514,8 +534,8 @@ async function main() {
514
534
  );
515
535
 
516
536
  for (const [filename, file] of Object.entries(customFiles)) {
517
- const filePath = join(config.outDir, filename);
518
- writeFileSync(filePath, file.content);
537
+ const filePath = path.join(config.outDir, filename);
538
+ fs.writeFileSync(filePath, file.content);
519
539
  currentFiles.push(filename);
520
540
  totalTokens += file.tokens;
521
541
  totalPages++;
@@ -530,25 +550,34 @@ async function main() {
530
550
  }
531
551
 
532
552
  // Save new manifest
533
- const newManifest: Manifest = {
553
+ const newManifest = {
534
554
  files: currentFiles,
535
555
  timestamp: new Date().toISOString(),
536
556
  };
537
557
  saveManifest(config.outDir, newManifest);
538
558
 
539
- console.log(`\n✨ Extraction completed!`);
559
+ console.log("\n✨ Extraction completed!");
540
560
  console.log(`📊 Total: ${totalPages} pages, ${totalTokens} tokens`);
541
561
  if (totalErrors > 0) {
542
562
  console.log(`⚠️ Errors: ${totalErrors}`);
543
563
  }
544
- console.log(`📁 Output directory: ${resolve(config.outDir)}`);
545
- console.log(`\n💡 Use --clear-credentials to remove stored API key`);
564
+ console.log(`📁 Output directory: ${path.resolve(config.outDir)}`);
565
+ console.log("\n💡 Use --clear-credentials to remove stored API key");
546
566
  } catch (error) {
547
567
  console.error("💥 Fatal error:", error.message);
548
568
  process.exit(1);
549
569
  }
550
570
  }
551
571
 
552
- if (import.meta.main) {
572
+ // Run main function if this file is executed directly
573
+ if (require.main === module) {
553
574
  main();
554
575
  }
576
+
577
+ module.exports = {
578
+ OAuth,
579
+ loadConfig,
580
+ getApiKey,
581
+ clearCredentials,
582
+ main,
583
+ };
package/package.json CHANGED
@@ -1,16 +1,15 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
- "bin": "cli.ts",
4
- "version": "0.0.2",
3
+ "bin": "cli.js",
4
+ "version": "0.0.4",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [
8
8
  "mod.js",
9
- "cli.ts"
9
+ "cli.js"
10
10
  ],
11
11
  "license": "MIT",
12
12
  "devDependencies": {
13
- "@cloudflare/workers-types": "4.20251011.0",
14
- "@types/bun": "1.3.0"
13
+ "@cloudflare/workers-types": "4.20251011.0"
15
14
  }
16
15
  }