extract-from-sitemap 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/cli.ts +434 -0
  2. package/mod.js +595 -0
  3. package/package.json +16 -0
package/cli.ts ADDED
@@ -0,0 +1,434 @@
+ #!/usr/bin/env bun
+ /// <reference types="@types/bun" />
+ /// <reference lib="esnext" />
+
+ import {
+   existsSync,
+   readFileSync,
+   writeFileSync,
+   mkdirSync,
+   rmSync,
+   readdirSync,
+ } from "fs";
+ import { join, dirname, resolve } from "path";
+ import { extractFromSitemap } from "./mod.js";
+
+ interface Config {
+   outDir: string;
+   origins: string[];
+   customUrls: Array<{
+     title: string;
+     description: string;
+     url: string;
+   }>;
+   keepOriginalUrls: boolean;
+   forceExtract: boolean;
+ }
+
+ interface Manifest {
+   files: string[];
+   timestamp: string;
+ }
+
+ class OAuth {
+   private clientId: string;
+   private redirectUri: string;
+   private scope: string;
+
+   constructor() {
+     this.clientId = "extract-from-sitemap-cli";
+     this.redirectUri = "http://localhost:3737/callback";
+     this.scope = "key:read";
+   }
+
+   async getApiKey(): Promise<string> {
+     console.log("šŸ” Starting OAuth flow...");
+
+     // Generate PKCE parameters
+     const { codeVerifier, codeChallenge } = await this.generatePKCE();
+
+     // Build authorization URL
+     const authUrl = new URL("https://platform.parallel.ai/getKeys/authorize");
+     authUrl.searchParams.set("client_id", this.clientId);
+     authUrl.searchParams.set("redirect_uri", this.redirectUri);
+     authUrl.searchParams.set("response_type", "code");
+     authUrl.searchParams.set("scope", this.scope);
+     authUrl.searchParams.set("code_challenge", codeChallenge);
+     authUrl.searchParams.set("code_challenge_method", "S256");
+     authUrl.searchParams.set("state", Math.random().toString(36));
+
+     console.log(`\nšŸ“– Please visit this URL to authorize the application:`);
+     console.log(`${authUrl.toString()}\n`);
+
+     // Start simple HTTP server to catch the callback
+     const code = await this.startCallbackServer();
+
+     // Exchange code for token
+     console.log("šŸ”„ Exchanging authorization code for API key...");
+
+     const response = await fetch("https://platform.parallel.ai/getKeys/token", {
+       method: "POST",
+       headers: { "Content-Type": "application/x-www-form-urlencoded" },
+       body: new URLSearchParams({
+         grant_type: "authorization_code",
+         code: code,
+         client_id: this.clientId,
+         redirect_uri: this.redirectUri,
+         code_verifier: codeVerifier,
+       }),
+     });
+
+     if (!response.ok) {
+       throw new Error(
+         `Token exchange failed: ${response.status} ${response.statusText}`
+       );
+     }
+
+     const { access_token } = await response.json();
+     console.log("āœ… Successfully obtained API key!");
+
+     return access_token;
+   }
+
+   private async generatePKCE(): Promise<{
+     codeVerifier: string;
+     codeChallenge: string;
+   }> {
+     const codeVerifier = btoa(
+       String.fromCharCode(...crypto.getRandomValues(new Uint8Array(32)))
+     ).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
+
+     const hash = await crypto.subtle.digest(
+       "SHA-256",
+       new TextEncoder().encode(codeVerifier)
+     );
+     const codeChallenge = btoa(
+       String.fromCharCode(...new Uint8Array(hash))
+     ).replace(/[+/=]/g, (m) => ({ "+": "-", "/": "_", "=": "" }[m]));
+
+     return { codeVerifier, codeChallenge };
+   }
+
+   private async startCallbackServer(): Promise<string> {
+     return new Promise((resolve, reject) => {
+       const server = Bun.serve({
+         port: 3737,
+         fetch(req) {
+           const url = new URL(req.url);
+
+           if (url.pathname === "/callback") {
+             const code = url.searchParams.get("code");
+             const error = url.searchParams.get("error");
+
+             if (error) {
+               reject(new Error(`OAuth error: ${error}`));
+               return new Response(
+                 "Error occurred. You can close this window.",
+                 { status: 400 }
+               );
+             }
+
+             if (code) {
+               resolve(code);
+               server.stop();
+               return new Response(
+                 "āœ… Authorization successful! You can close this window and return to the terminal."
+               );
+             }
+           }
+
+           return new Response("Invalid request", { status: 404 });
+         },
+       });
+
+       // Timeout after 5 minutes
+       setTimeout(() => {
+         server.stop();
+         reject(new Error("OAuth flow timed out"));
+       }, 300000);
+     });
+   }
+ }
+
+ async function loadConfig(): Promise<Config> {
+   const configPath = resolve("llmtext.json");
+
+   if (!existsSync(configPath)) {
+     console.error(
+       "āŒ llmtext.json not found. Please create a configuration file."
+     );
+     console.log("\nExample llmtext.json:");
+     console.log(
+       JSON.stringify(
+         {
+           outDir: "./docs",
+           origins: ["https://docs.example.com"],
+           customUrls: [],
+           keepOriginalUrls: false,
+           forceExtract: false,
+         },
+         null,
+         2
+       )
+     );
+     process.exit(1);
+   }
+
+   try {
+     const config = JSON.parse(readFileSync(configPath, "utf8")) as Config;
+
+     // Validate required fields
+     if (!config.outDir) throw new Error("outDir is required");
+     if (!Array.isArray(config.origins))
+       throw new Error("origins must be an array");
+
+     // Set defaults
+     config.customUrls = config.customUrls || [];
+     config.keepOriginalUrls = config.keepOriginalUrls ?? false;
+     config.forceExtract = config.forceExtract ?? false;
+
+     return config;
+   } catch (error) {
+     console.error("āŒ Error reading llmtext.json:", error.message);
+     process.exit(1);
+   }
+ }
+
+ async function getApiKey(): Promise<string> {
+   // Check environment variables first
+   let apiKey = process.env.PARALLEL_API_KEY;
+
+   if (!apiKey && existsSync(".env")) {
+     // Try to load from .env file
+     const envContent = readFileSync(".env", "utf8");
+     const match = envContent.match(/^PARALLEL_API_KEY=(.+)$/m);
+     if (match) {
+       apiKey = match[1].trim();
+     }
+   }
+
+   if (!apiKey) {
+     console.log("šŸ”‘ No API key found in environment or .env file.");
+     const oauth = new OAuth();
+     apiKey = await oauth.getApiKey();
+   }
+
+   return apiKey;
+ }
+
+ function loadManifest(outDir: string): Manifest {
+   const manifestPath = join(outDir, "llmtext-manifest.json");
+
+   if (!existsSync(manifestPath)) {
+     return { files: [], timestamp: new Date().toISOString() };
+   }
+
+   try {
+     return JSON.parse(readFileSync(manifestPath, "utf8"));
+   } catch {
+     return { files: [], timestamp: new Date().toISOString() };
+   }
+ }
+
+ function saveManifest(outDir: string, manifest: Manifest): void {
+   const manifestPath = join(outDir, "llmtext-manifest.json");
+   writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
+ }
+
+ function cleanupOldFiles(
+   outDir: string,
+   currentFiles: string[],
+   previousFiles: string[]
+ ): void {
+   const filesToRemove = previousFiles.filter(
+     (file) => !currentFiles.includes(file)
+   );
+
+   for (const file of filesToRemove) {
+     const filePath = join(outDir, file);
+     try {
+       if (existsSync(filePath)) {
+         rmSync(filePath);
+         console.log(`šŸ—‘ļø Removed old file: ${file}`);
+       }
+     } catch (error) {
+       console.warn(`āš ļø Could not remove ${file}:`, error.message);
+     }
+   }
+ }
+
+ async function processCustomUrls(
+   customUrls: Array<{ title: string; description: string; url: string }>,
+   apiKey: string,
+   forceExtract: boolean
+ ): Promise<Record<string, any>> {
+   const files: Record<string, any> = {};
+
+   for (const customUrl of customUrls) {
+     console.log(`šŸ“„ Processing custom URL: ${customUrl.url}`);
+
+     try {
+       // For custom URLs, we need to extract them individually
+       const response = await fetch("https://api.parallel.ai/v1beta/extract", {
+         method: "POST",
+         headers: {
+           "Content-Type": "application/json",
+           "parallel-beta": "search-extract-2025-10-10",
+           "x-api-key": apiKey,
+         },
+         body: JSON.stringify({
+           urls: [customUrl.url],
+           full_content: true,
+         }),
+       });
+
+       if (response.ok) {
+         const result = await response.json();
+         if (result.results && result.results.length > 0) {
+           const extracted = result.results[0];
+           const filename =
+             customUrl.title.replace(/[^a-zA-Z0-9]/g, "_").toLowerCase() + ".md";
+
+           files[filename] = {
+             content: extracted.full_content || "",
+             title: customUrl.title,
+             description: customUrl.description,
+             extracted: true,
+             publishedDate: extracted.published_date || "",
+             status: 200,
+             tokens: Math.round((extracted.full_content || "").length / 5),
+           };
+         }
+       }
+     } catch (error) {
+       console.error(
+         `āŒ Error processing custom URL ${customUrl.url}:`,
+         error.message
+       );
+     }
+   }
+
+   return files;
+ }
+
+ async function main() {
+   console.log("šŸš€ Extract from Sitemap CLI");
+
+   try {
+     const config = await loadConfig();
+     const apiKey = await getApiKey();
+
+     // Ensure output directory exists
+     mkdirSync(config.outDir, { recursive: true });
+
+     // Load previous manifest
+     const previousManifest = loadManifest(config.outDir);
+     const currentFiles: string[] = [];
+
+     let totalTokens = 0;
+     let totalPages = 0;
+     let totalErrors = 0;
+
+     // Process each origin
+     for (const origin of config.origins) {
+       console.log(`\n🌐 Processing origin: ${origin}`);
+
+       try {
+         const result = await extractFromSitemap(
+           origin,
+           config.forceExtract,
+           apiKey
+         );
+
+         console.log(
+           `āœ… Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
+         );
+         if (result.errors > 0) {
+           console.log(`āš ļø ${result.errors} errors occurred`);
+         }
+
+         // Write files to disk
+         for (const [path, file] of Object.entries(result.files)) {
+           let filename = path;
+
+           if (!config.keepOriginalUrls) {
+             // Create domain-specific subdirectory
+             const domain = new URL(
+               origin.startsWith("http") ? origin : `https://${origin}`
+             ).hostname;
+             const domainDir = join(config.outDir, domain);
+             mkdirSync(domainDir, { recursive: true });
+             filename = join(
+               domain,
+               path.startsWith("/") ? path.slice(1) : path
+             );
+           } else {
+             filename = path.startsWith("/") ? path.slice(1) : path;
+           }
+
+           const filePath = join(config.outDir, filename);
+           const fileDir = dirname(filePath);
+
+           mkdirSync(fileDir, { recursive: true });
+           writeFileSync(filePath, file.content);
+           currentFiles.push(filename);
+
+           console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
+         }
+
+         totalTokens += result.totalTokens;
+         totalPages += result.totalPages;
+         totalErrors += result.errors;
+       } catch (error) {
+         console.error(`āŒ Error processing ${origin}:`, error.message);
+         totalErrors++;
+       }
+     }
+
+     // Process custom URLs
+     if (config.customUrls.length > 0) {
+       console.log(`\nšŸ“‹ Processing ${config.customUrls.length} custom URLs...`);
+       const customFiles = await processCustomUrls(
+         config.customUrls,
+         apiKey,
+         config.forceExtract
+       );
+
+       for (const [filename, file] of Object.entries(customFiles)) {
+         const filePath = join(config.outDir, filename);
+         writeFileSync(filePath, file.content);
+         currentFiles.push(filename);
+         totalTokens += file.tokens;
+         totalPages++;
+
+         console.log(`šŸ“ Wrote: ${filename} (${file.tokens} tokens)`);
+       }
+     }
+
+     // Clean up old files
+     if (previousManifest.files.length > 0) {
+       cleanupOldFiles(config.outDir, currentFiles, previousManifest.files);
+     }
+
+     // Save new manifest
+     const newManifest: Manifest = {
+       files: currentFiles,
+       timestamp: new Date().toISOString(),
+     };
+     saveManifest(config.outDir, newManifest);
+
+     console.log(`\n✨ Extraction completed!`);
+     console.log(`šŸ“Š Total: ${totalPages} pages, ${totalTokens} tokens`);
+     if (totalErrors > 0) {
+       console.log(`āš ļø Errors: ${totalErrors}`);
+     }
+     console.log(`šŸ“ Output directory: ${resolve(config.outDir)}`);
+   } catch (error) {
+     console.error("šŸ’„ Fatal error:", error.message);
+     process.exit(1);
+   }
+ }
+
+ if (import.meta.main) {
+   main();
+ }
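
A minimal end-to-end sketch of using this CLI (illustrative, with assumptions: docs.example.com is a placeholder, and the bunx invocation assumes Bun resolves the package's bin entry; the shebang on cli.ts requires Bun in any case). The llmtext.json fields mirror the Config interface above; if PARALLEL_API_KEY is in neither the environment nor .env, the OAuth flow runs instead.

llmtext.json:

{
  "outDir": "./docs",
  "origins": ["https://docs.example.com"],
  "customUrls": [],
  "keepOriginalUrls": false,
  "forceExtract": false
}

Run:

PARALLEL_API_KEY=<your-key> bunx extract-from-sitemap

With keepOriginalUrls set to false, pages land under ./docs/<hostname>/ together with that origin's llms.txt, and ./docs/llmtext-manifest.json records what was written so the next run can prune files that dropped out of the sitemap.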
package/mod.js ADDED
@@ -0,0 +1,595 @@
+ /**
+  * @typedef {Object} FileResult
+  * @property {string} [error] - Error message if file processing failed
+  * @property {string} content - The extracted or fetched content of the file
+  * @property {string} publishedDate - The published date of the file/document
+  * @property {string} title - The title of the file/document
+  * @property {string} description - The description of the file/document
+  * @property {boolean} extracted - Whether the content was extracted or directly fetched
+  * @property {number} status - HTTP status code or processing status
+  * @property {number} tokens - Number of tokens in the content
+  */
+
+ /**
+  * @typedef {Object} ResponseData
+  * @property {Record<string, FileResult>} files - Map of file identifiers to their results
+  * @property {number} totalTokens - Total number of tokens across all files
+  * @property {number} totalPages - Total number of pages processed
+  * @property {number} errors - Number of errors encountered during processing
+  * @property {number} processingTimeMs - Total processing time in milliseconds
+  * @property {number} extractApiCallCount - Number of API calls made for content extraction
+  * @property {number} fetchCount - Number of fetch operations performed
+  */
+
+ /**
+  * Extract content from sitemap URLs with markdown variant detection
+  * @param {string} origin - The origin URL to extract from
+  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
+  * @param {string} apiKey - Parallel API key
+  * @returns {Promise<ResponseData>}
+  */
+ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
+   const startTime = Date.now();
+   let fetchCount = 0;
+   let extractApiCallCount = 0;
+
+   // Discover sitemap
+   const sitemapUrl = await discoverSitemap(origin);
+   if (!sitemapUrl) {
+     throw new Error(`Could not find sitemap for ${origin}`);
+   }
+
+   // Parse sitemap and get URLs
+   const urls = await parseSitemap(sitemapUrl);
+
+   // Process each URL
+   const files = {};
+   const urlsNeedingExtract = [];
+
+   // Fetch all URLs with markdown variant detection
+   await Promise.all(
+     urls.map(async (urlStr) => {
+       try {
+         const result = await fetchUrlContent(urlStr, forceExtract);
+         fetchCount += result.fetchCount;
+
+         const path = getPathFromUrl(urlStr) + ".md";
+         files[path] = {
+           content: result.content,
+           title: cleanTitle(result.title, origin),
+           description: cleanDescription(result.description, result.title),
+           extracted: false,
+           status: result.status,
+           tokens: Math.round(result.content.length / 5),
+           publishedDate: result.publishedDate || "",
+           error: result.error,
+         };
+
+         // Track URLs that need Extract API fallback
+         if (!result.content || result.error) {
+           urlsNeedingExtract.push(urlStr);
+         }
+       } catch (error) {
+         const path = getPathFromUrl(urlStr) + ".md";
+         files[path] = {
+           error: error instanceof Error ? error.message : "Unknown error",
+           content: "",
+           title: "",
+           description: "",
+           extracted: false,
+           status: 0,
+           tokens: 0,
+           publishedDate: "",
+         };
+         if (!forceExtract) {
+           urlsNeedingExtract.push(urlStr);
+         }
+       }
+     })
+   );
+
+   // Use Parallel Extract API for URLs that didn't return content
+   if (urlsNeedingExtract.length > 0 && apiKey) {
+     try {
+       extractApiCallCount = 1;
+       const extractResults = await callParallelExtractAPI(
+         urlsNeedingExtract,
+         apiKey
+       );
+
+       // Merge extract results
+       for (const result of extractResults.results) {
+         const path = getPathFromUrl(result.url) + ".md";
+         const existing = files[path] || {
+           content: "",
+           title: "",
+           description: "",
+           extracted: false,
+           status: 0,
+           tokens: 0,
+           publishedDate: "",
+         };
+
+         const content = result.full_content || existing.content;
+         files[path] = {
+           content,
+           title: cleanTitle(result.title || existing.title, origin),
+           description: cleanDescription(
+             existing.description,
+             result.title || existing.title
+           ),
+           extracted: !!result.full_content,
+           publishedDate: result.published_date || existing.publishedDate,
+           status: existing.status,
+           tokens: Math.round(content.length / 5),
+         };
+       }
+
+       // Handle extract errors
+       for (const error of extractResults.errors) {
+         const path = getPathFromUrl(error.url) + ".md";
+         if (files[path]) {
+           files[path].error = error.message;
+         }
+       }
+     } catch (error) {
+       console.error("Extract API error:", error);
+     }
+   }
+
+   // Generate llms.txt
+   const llmsTxt = generateLlmsTxt(origin, files);
+   files["/llms.txt"] = {
+     content: llmsTxt,
+     title: "LLMs.txt",
+     description: "LLM-friendly content listing",
+     extracted: false,
+     publishedDate: "",
+     status: 200,
+     tokens: Math.round(llmsTxt.length / 5),
+   };
+
+   // Sort files by path
+   const sortedFiles = Object.keys(files)
+     .sort()
+     .reduce((acc, key) => {
+       acc[key] = files[key];
+       return acc;
+     }, {});
+
+   // Calculate totals
+   const totalTokens = Object.values(sortedFiles).reduce(
+     (sum, file) => sum + file.tokens,
+     0
+   );
+   const totalPages = Object.keys(sortedFiles).length - 1; // Exclude llms.txt from page count
+   const errors = Object.values(sortedFiles).filter((file) => file.error).length;
+   const processingTimeMs = Date.now() - startTime;
+
+   return {
+     files: sortedFiles,
+     totalTokens,
+     totalPages,
+     errors,
+     processingTimeMs,
+     extractApiCallCount,
+     fetchCount,
+   };
+ }
+
+ /**
+  * Clean title by removing site name duplicates
+  * @param {string} title - Original title
+  * @param {string} origin - Site origin
+  * @returns {string} Cleaned title
+  */
+ function cleanTitle(title, origin) {
+   if (!title) return "";
+
+   // Extract domain name from origin
+   const domain = new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname.replace(/^www\./, "");
+   const siteName = domain.split(".")[0];
+
+   // Remove common site name patterns from end of title
+   const patterns = [
+     new RegExp(`\\s*[-|•]\\s*${siteName}\\s*$`, "i"),
+     new RegExp(`\\s*[-|•]\\s*${domain}\\s*$`, "i"),
+     /\s*[-|•]\s*Home\s*$/i,
+     /\s*[-|•]\s*Documentation\s*$/i,
+   ];
+
+   let cleaned = title;
+   for (const pattern of patterns) {
+     cleaned = cleaned.replace(pattern, "");
+   }
+
+   return cleaned.trim();
+ }
+
+ /**
+  * Clean description by removing title duplicates
+  * @param {string} description - Original description
+  * @param {string} title - Page title
+  * @returns {string} Cleaned description
+  */
+ function cleanDescription(description, title) {
+   if (!description || !title) return description || "";
+
+   // Remove title from beginning of description if it's a duplicate
+   if (description.toLowerCase().startsWith(title.toLowerCase())) {
+     return description
+       .substring(title.length)
+       .replace(/^[.\s-]+/, "")
+       .trim();
+   }
+
+   return description;
+ }
+
+ /**
+  * Discover sitemap URL for a given origin
+  * @param {string} origin - The origin to search for sitemap
+  * @returns {Promise<string|null>} Sitemap URL or null if not found
+  */
+ async function discoverSitemap(origin) {
+   // Ensure origin has protocol
+   const baseUrl = origin.startsWith("http") ? origin : `https://${origin}`;
+   const domain = new URL(baseUrl).origin;
+
+   // Try common sitemap locations
+   const candidates = [
+     `${domain}/sitemap.xml`,
+     `${domain}/sitemap_index.xml`,
+     `${domain}/sitemap-index.xml`,
+     `${domain}/sitemap1.xml`,
+   ];
+
+   // Also check robots.txt
+   try {
+     const robotsRes = await fetch(`${domain}/robots.txt`, {
+       headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+     });
+     if (robotsRes.ok) {
+       const robotsTxt = await robotsRes.text();
+       const sitemapMatch = robotsTxt.match(/Sitemap:\s*(.+)/i);
+       if (sitemapMatch) {
+         candidates.unshift(sitemapMatch[1].trim());
+       }
+     }
+   } catch {}
+
+   // Test each candidate
+   for (const candidate of candidates) {
+     try {
+       const res = await fetch(candidate, {
+         headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+       });
+       if (res.ok) {
+         const contentType = res.headers.get("content-type") || "";
+         if (contentType.includes("xml") || contentType.includes("text")) {
+           return candidate;
+         }
+       }
+     } catch {}
+   }
+
+   return null;
+ }
+
+ /**
+  * Parse sitemap XML and extract URLs
+  * @param {string} sitemapUrl - URL of the sitemap
+  * @returns {Promise<string[]>} Array of URLs found in sitemap
+  */
+ async function parseSitemap(sitemapUrl) {
+   const res = await fetch(sitemapUrl, {
+     headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+   });
+
+   if (!res.ok) {
+     throw new Error(`Failed to fetch sitemap: ${res.status}`);
+   }
+
+   const xml = await res.text();
+   const urls = [];
+
+   // Check if this is a sitemap index
+   const sitemapPattern =
+     /<sitemap>[\s\S]*?<loc>(.+?)<\/loc>[\s\S]*?<\/sitemap>/gi;
+   const sitemapMatches = xml.matchAll(sitemapPattern);
+   const childSitemaps = Array.from(sitemapMatches, (m) => m[1]);
+
+   if (childSitemaps.length > 0) {
+     // Recursively parse child sitemaps
+     const childUrls = await Promise.all(
+       childSitemaps.map((url) => parseSitemap(url))
+     );
+     return childUrls.flat();
+   }
+
+   // Parse regular sitemap
+   const urlPattern = /<url>[\s\S]*?<loc>(.+?)<\/loc>[\s\S]*?<\/url>/gi;
+   const matches = xml.matchAll(urlPattern);
+
+   for (const match of matches) {
+     urls.push(match[1]);
+   }
+
+   return urls;
+ }
+
+ /**
+  * Fetch content from URL with markdown variant detection
+  * @param {string} urlStr - URL to fetch
+  * @param {boolean} forceExtract - Skip markdown variant detection
+  * @returns {Promise<{content: string, title: string, description: string, status: number, error?: string, fetchCount: number, publishedDate?: string}>}
+  */
+ async function fetchUrlContent(urlStr, forceExtract = false) {
+   let title = "";
+   let description = "";
+   let content = "";
+   let error;
+   let status = 0;
+   let fetchCount = 0;
+   let publishedDate = "";
+
+   if (forceExtract) {
+     // Just fetch HTML for metadata when forcing extract
+     try {
+       const res = await fetch(urlStr, {
+         headers: {
+           Accept: "text/html",
+           "User-Agent": "sitemap-to-llmtext-bot/1.0",
+         },
+       });
+       fetchCount++;
+       status = res.status;
+
+       if (res.ok) {
+         const html = await res.text();
+         ({ title, description, publishedDate } = extractMetadata(html));
+       }
+     } catch (err) {
+       error = `HTML fetch failed: ${err.message || "Unknown"}`;
+     }
+
+     return {
+       content,
+       title,
+       description,
+       status,
+       error,
+       fetchCount,
+       publishedDate,
+     };
+   }
+
+   // First, fetch HTML to check for markdown variants
+   let html = "";
+   try {
+     const htmlRes = await fetch(urlStr, {
+       headers: {
+         Accept: "text/html",
+         "User-Agent": "sitemap-to-llmtext-bot/1.0",
+       },
+     });
+     fetchCount++;
+     status = htmlRes.status;
+
+     if (htmlRes.ok) {
+       html = await htmlRes.text();
+       ({ title, description, publishedDate } = extractMetadata(html));
+
+       // Look for markdown alternate link
+       const mdAlternateMatch = html.match(
+         /<link\s+rel=["']alternate["']\s+type=["']text\/markdown["']\s+href=["']([^"']+)["'][^>]*>/i
+       );
+
+       if (mdAlternateMatch) {
+         const mdUrl = new URL(mdAlternateMatch[1], urlStr).href;
+         try {
+           const mdRes = await fetch(mdUrl, {
+             headers: {
+               Accept: "text/markdown, text/plain",
+               "User-Agent": "sitemap-to-llmtext-bot/1.0",
+             },
+           });
+           fetchCount++;
+
+           if (mdRes.ok) {
+             content = await mdRes.text();
+             return {
+               content,
+               title,
+               description,
+               status,
+               fetchCount,
+               publishedDate,
+             };
+           }
+         } catch (mdErr) {
+           // Fall through to try direct markdown request
+         }
+       }
+     }
+   } catch (err) {
+     error = `HTML fetch failed: ${err.message || "Unknown"}`;
+   }
+
+   // Try fetching with markdown accept header
+   try {
+     const mdRes = await fetch(urlStr, {
+       headers: {
+         Accept: "text/markdown",
+         "User-Agent": "sitemap-to-llmtext-bot/1.0",
+       },
+     });
+     fetchCount++;
+     status = status || mdRes.status;
+
+     const contentType = mdRes.headers.get("content-type") || "";
+     if (mdRes.ok && contentType.includes("markdown")) {
+       content = await mdRes.text();
+     }
+   } catch (mdErr) {
+     if (!error) {
+       error = `Markdown fetch failed: ${mdErr.message || "Unknown"}`;
+     }
+   }
+
+   return {
+     content,
+     title,
+     description,
+     status,
+     error,
+     fetchCount,
+     publishedDate,
+   };
+ }
+
+ /**
+  * Extract metadata from HTML
+  * @param {string} html - HTML content
+  * @returns {{title: string, description: string, publishedDate: string}}
+  */
+ function extractMetadata(html) {
+   let title = "";
+   let description = "";
+   let publishedDate = "";
+
+   // Extract title
+   const titleMatch = html.match(/<title>([^<]+)<\/title>/i);
+   if (titleMatch) {
+     title = titleMatch[1].trim();
+   }
+
+   // Extract og:description
+   const ogDescMatch = html.match(
+     /<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i
+   );
+   if (ogDescMatch) {
+     description = ogDescMatch[1].trim();
+   }
+
+   // Fallback to meta description
+   if (!description) {
+     const metaDescMatch = html.match(
+       /<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i
+     );
+     if (metaDescMatch) {
+       description = metaDescMatch[1].trim();
+     }
+   }
+
+   // Extract published date from various meta tags
+   const datePatterns = [
+     /<meta\s+property=["']article:published_time["']\s+content=["']([^"']+)["']/i,
+     /<meta\s+name=["']date["']\s+content=["']([^"']+)["']/i,
+     /<meta\s+name=["']publish-date["']\s+content=["']([^"']+)["']/i,
+   ];
+
+   for (const pattern of datePatterns) {
+     const match = html.match(pattern);
+     if (match) {
+       publishedDate = match[1].trim();
+       break;
+     }
+   }
+
+   return { title, description, publishedDate };
+ }
+
+ /**
+  * Convert URL to file path
+  * @param {string} urlStr - URL to convert
+  * @returns {string} File path
+  */
+ function getPathFromUrl(urlStr) {
+   try {
+     const url = new URL(urlStr);
+     let path = url.pathname;
+
+     // Handle root path
+     if (path === "/" || path === "") {
+       return "/index.html";
+     }
+
+     // Handle paths ending with /
+     if (path.endsWith("/")) {
+       path += "index.html";
+     }
+
+     return path;
+   } catch {
+     // Fallback to a sanitized version of the full URL
+     return "/" + urlStr.replace(/[^a-zA-Z0-9]/g, "_");
+   }
+ }
+
+ /**
+  * Generate llms.txt content
+  * @param {string} origin - Site origin
+  * @param {Record<string, any>} files - Files object
+  * @returns {string} Generated llms.txt content
+  */
+ function generateLlmsTxt(origin, files) {
+   // Find homepage for top-level description
+   const homepageFile = files["/index.html.md"] || files[Object.keys(files)[0]];
+   const siteTitle =
+     homepageFile?.title ||
+     new URL(origin.startsWith("http") ? origin : `https://${origin}`).hostname;
+   const siteDescription =
+     homepageFile?.description || `Documentation for ${siteTitle}`;
+
+   let llmsTxt = `# ${siteTitle}\n\n> ${siteDescription}\n\n`;
+
+   // Add documentation section
+   llmsTxt += "## Documentation\n\n";
+
+   // Sort files by path for consistent ordering
+   const sortedFiles = Object.entries(files)
+     .filter(([path]) => path !== "/llms.txt")
+     .sort(([a], [b]) => a.localeCompare(b));
+
+   for (const [path, file] of sortedFiles) {
+     if (file.content || file.title) {
+       const title = file.title || path.replace(".md", "");
+       const description = file.description ? `: ${file.description}` : "";
+       llmsTxt += `- [${title}](${path.replace(".md", "")}) (${
+         file.tokens
+       } tokens)${description}\n`;
+     }
+   }
+
+   return llmsTxt;
+ }
+
+ /**
+  * Call Parallel Extract API for multiple URLs
+  * @param {string[]} urls - URLs to extract
+  * @param {string} apiKey - Parallel API key
+  * @returns {Promise<{results: Array<{url: string, published_date: string, full_content: string|null, title: string|null}>, errors: Array<{url: string, message: string}>}>}
+  */
+ async function callParallelExtractAPI(urls, apiKey) {
+   const response = await fetch("https://api.parallel.ai/v1beta/extract", {
+     method: "POST",
+     headers: {
+       "Content-Type": "application/json",
+       "parallel-beta": "search-extract-2025-10-10",
+       "x-api-key": apiKey,
+     },
+     body: JSON.stringify({
+       urls,
+       full_content: true,
+     }),
+   });
+
+   if (!response.ok) {
+     throw new Error(
+       `Extract API failed: ${response.status} ${response.statusText}`
+     );
+   }
+
+   return await response.json();
+ }
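
mod.js can also be used programmatically. A minimal sketch following the JSDoc above (assumptions: a runtime with a global fetch such as Bun or Node 18+, a Parallel API key in the environment, and that the import specifier resolves through this package's name and main fields):

import { extractFromSitemap } from "extract-from-sitemap";

// forceExtract = false: prefer markdown variants (rel="alternate" links and
// Accept: text/markdown requests), falling back to the Parallel Extract API
// only for pages that came back empty or errored.
const result = await extractFromSitemap(
  "https://docs.example.com",
  false,
  process.env.PARALLEL_API_KEY,
);

console.log(`${result.totalPages} pages, ${result.totalTokens} tokens`);
console.log(`${result.errors} errors, ${result.fetchCount} fetches, ${result.extractApiCallCount} Extract API call(s)`);
console.log(result.files["/llms.txt"].content); // the generated index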
package/package.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "name": "extract-from-sitemap",
+   "bin": "cli.ts",
+   "version": "0.0.1",
+   "main": "mod.js",
+   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and an llms.txt, using Parallel.ai APIs.",
+   "files": [
+     "mod.js",
+     "cli.ts"
+   ],
+   "license": "MIT",
+   "devDependencies": {
+     "@cloudflare/workers-types": "4.20251011.0",
+     "@types/bun": "1.3.0"
+   }
+ }