extract-from-sitemap 0.0.20 → 0.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/mod.js +257 -16
  2. package/package.json +4 -1
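
In short: 0.0.22 makes `extractFromSitemap` probe the target origin for an llms.txt file and, when one with at least one entry is found, extract from its entries instead of the sitemap; sitemap discovery remains the fallback. The release adds `parse-llms-txt` as the package's first runtime dependency, and the remaining `mod.js` hunks are mechanical trailing-comma reformatting. A sketch of the new call flow follows the `mod.js` diff below.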
package/mod.js CHANGED
@@ -1,3 +1,5 @@
+import { parseLlmsTxt } from "parse-llms-txt";
+
 /**
  * @typedef {Object} FileResult
  * @property {string} [error] - Error message if file processing failed
@@ -56,8 +58,228 @@
  * @property {string} pathPrefix - Path prefix for links
  */

+/**
+ * Fetch llms.txt from origin if available
+ * @param {string} origin - The origin to check for llms.txt
+ * @returns {Promise<string|null>} The llms.txt content or null if not found
+ */
+async function fetchLlmsTxt(origin) {
+  const baseUrl = origin.startsWith("http") ? origin : `https://${origin}`;
+  const domain = new URL(baseUrl).origin;
+
+  try {
+    const res = await fetch(`${domain}/llms.txt`, {
+      headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+    });
+
+    if (res.ok) {
+      const contentType = res.headers.get("content-type") || "";
+      if (contentType.includes("text")) {
+        return await res.text();
+      }
+    }
+  } catch {}
+
+  return null;
+}
+
+/**
+ * Extract content using llms.txt entries
+ * @param {import("../llms-txt-parse/mod.js").LlmsTxtFile} llmsTxt - Parsed llms.txt structure
+ * @param {string} origin - The origin URL for resolving relative URLs
+ * @param {string} apiKey - Parallel API key
+ * @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
+ * @returns {Promise<ResponseData>}
+ */
+async function extractFromLlmsTxtEntries(
+  llmsTxt,
+  origin,
+  apiKey,
+  titleRemovePattern,
+) {
+  const startTime = Date.now();
+  let fetchCount = 0;
+  let extractApiCallCount = 0;
+
+  const files = {};
+  const urlsNeedingExtract = [];
+
+  const baseUrl = origin.startsWith("http") ? origin : `https://${origin}`;
+  const domain = new URL(baseUrl).origin;
+
+  // Collect all file entries from all sections
+  const allEntries = llmsTxt.sections.flatMap((section) =>
+    section.files.map((file) => ({
+      ...file,
+      section: section.name,
+    })),
+  );
+
+  // Fetch all URLs from llms.txt entries
+  await Promise.all(
+    allEntries.map(async (entry) => {
+      // Resolve relative URLs against origin
+      let resolvedUrl;
+      try {
+        resolvedUrl = new URL(entry.url, domain).href;
+      } catch {
+        resolvedUrl = entry.url;
+      }
+
+      try {
+        // Try fetching the markdown URL directly
+        const res = await fetch(resolvedUrl, {
+          headers: {
+            Accept: "text/markdown, text/plain, */*",
+            "User-Agent": "sitemap-to-llmtext-bot/1.0",
+          },
+        });
+        fetchCount++;
+
+        const path = getPathFromUrl(resolvedUrl);
+        // Ensure .md extension
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+
+        if (res.ok) {
+          const content = await res.text();
+
+          // Check if we actually got markdown (not HTML)
+          const isMarkdown =
+            !content.trim().startsWith("<!DOCTYPE") &&
+            !content.trim().startsWith("<html");
+
+          if (isMarkdown && content.trim()) {
+            files[filePath] = {
+              content,
+              title: cleanTitle(entry.name, titleRemovePattern),
+              description: cleanDescription(entry.notes || "", entry.name),
+              extracted: false,
+              status: res.status,
+              tokens: Math.round(content.length / 5),
+              publishedDate: "",
+              originalUrl: resolvedUrl,
+            };
+            return;
+          }
+        }
+
+        // Mark for extraction fallback
+        files[filePath] = {
+          content: "",
+          title: cleanTitle(entry.name, titleRemovePattern),
+          description: cleanDescription(entry.notes || "", entry.name),
+          extracted: false,
+          status: res.status,
+          tokens: 0,
+          publishedDate: "",
+          originalUrl: resolvedUrl,
+          error: "Could not fetch markdown content",
+        };
+        urlsNeedingExtract.push(resolvedUrl);
+      } catch (error) {
+        const path = getPathFromUrl(resolvedUrl);
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+        files[filePath] = {
+          error: error instanceof Error ? error.message : "Unknown error",
+          content: "",
+          title: cleanTitle(entry.name, titleRemovePattern),
+          description: cleanDescription(entry.notes || "", entry.name),
+          extracted: false,
+          status: 0,
+          tokens: 0,
+          publishedDate: "",
+          originalUrl: resolvedUrl,
+        };
+        urlsNeedingExtract.push(resolvedUrl);
+      }
+    }),
+  );
+
+  // Use Parallel Extract API for URLs that didn't return content
+  if (urlsNeedingExtract.length > 0 && apiKey) {
+    try {
+      extractApiCallCount = 1;
+      const extractResults = await callParallelExtractAPI(
+        urlsNeedingExtract,
+        apiKey,
+      );
+
+      // Merge extract results
+      for (const result of extractResults.results) {
+        const path = getPathFromUrl(result.url);
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+        const existing = files[filePath] || {
+          content: "",
+          title: "",
+          description: "",
+          extracted: false,
+          status: 0,
+          tokens: 0,
+          publishedDate: "",
+          originalUrl: result.url,
+        };
+
+        const content = result.full_content || existing.content;
+        files[filePath] = {
+          content,
+          title: cleanTitle(result.title || existing.title, titleRemovePattern),
+          description: cleanDescription(
+            existing.description,
+            result.title || existing.title,
+          ),
+          extracted: !!result.full_content,
+          publishedDate: result.published_date || existing.publishedDate,
+          status: existing.status,
+          tokens: Math.round(content.length / 5),
+          originalUrl: existing.originalUrl,
+        };
+      }
+
+      // Handle extract errors
+      for (const error of extractResults.errors) {
+        const path = getPathFromUrl(error.url);
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+        if (files[filePath]) {
+          files[filePath].error = error.message;
+        }
+      }
+    } catch (error) {
+      console.error("Extract API error:", error);
+    }
+  }
+
+  // Sort files by path
+  const sortedFiles = Object.keys(files)
+    .sort()
+    .reduce((acc, key) => {
+      acc[key] = files[key];
+      return acc;
+    }, {});
+
+  // Calculate totals
+  const totalTokens = Object.values(sortedFiles).reduce(
+    (sum, file) => sum + file.tokens,
+    0,
+  );
+  const totalPages = Object.keys(sortedFiles).length;
+  const errors = Object.values(sortedFiles).filter((file) => file.error).length;
+  const processingTimeMs = Date.now() - startTime;
+
+  return {
+    files: sortedFiles,
+    totalTokens,
+    totalPages,
+    errors,
+    processingTimeMs,
+    extractApiCallCount,
+    fetchCount,
+    usedLlmsTxt: true,
+  };
+}
+
 /**
  * Extract content from sitemap URLs with markdown variant detection
+ * Tries llms.txt first if available, then falls back to sitemap
  * @param {string} origin - The origin URL to extract from
  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
  * @param {string} apiKey - Parallel API key
@@ -68,16 +290,35 @@ export async function extractFromSitemap(
   origin,
   forceExtract = false,
   apiKey,
-  titleRemovePattern
+  titleRemovePattern,
 ) {
   const startTime = Date.now();
   let fetchCount = 0;
   let extractApiCallCount = 0;

-  // Discover sitemap
+  // Try llms.txt first
+  const llmsTxtContent = await fetchLlmsTxt(origin);
+  if (llmsTxtContent) {
+    const llmsTxt = parseLlmsTxt(llmsTxtContent);
+    const totalEntries = llmsTxt.sections.reduce(
+      (sum, section) => sum + section.files.length,
+      0,
+    );
+    if (totalEntries > 0) {
+      console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
+      return extractFromLlmsTxtEntries(
+        llmsTxt,
+        origin,
+        apiKey,
+        titleRemovePattern,
+      );
+    }
+  }
+
+  // Fall back to sitemap discovery
   const sitemapUrl = await discoverSitemap(origin);
   if (!sitemapUrl) {
-    throw new Error(`Could not find sitemap for ${origin}`);
+    throw new Error(`Could not find sitemap or llms.txt for ${origin}`);
   }

   // Parse sitemap and get URLs
@@ -128,7 +369,7 @@ export async function extractFromSitemap(
          urlsNeedingExtract.push(urlStr);
        }
      }
-    })
+    }),
   );

   // Use Parallel Extract API for URLs that didn't return content
@@ -137,7 +378,7 @@ export async function extractFromSitemap(
       extractApiCallCount = 1;
       const extractResults = await callParallelExtractAPI(
         urlsNeedingExtract,
-        apiKey
+        apiKey,
       );

       // Merge extract results
@@ -160,7 +401,7 @@ export async function extractFromSitemap(
           title: cleanTitle(result.title || existing.title, titleRemovePattern),
           description: cleanDescription(
             existing.description,
-            result.title || existing.title
+            result.title || existing.title,
           ),
           extracted: !!result.full_content,
           publishedDate: result.published_date || existing.publishedDate,
@@ -193,7 +434,7 @@ export async function extractFromSitemap(
   // Calculate totals
   const totalTokens = Object.values(sortedFiles).reduce(
     (sum, file) => sum + file.tokens,
-    0
+    0,
   );
   const totalPages = Object.keys(sortedFiles).length;
   const errors = Object.values(sortedFiles).filter((file) => file.error).length;
@@ -296,7 +537,7 @@ export async function processLLMTextConfig(config, apiKey) {
       sourceConfig.origin,
       sourceConfig.forceExtract || false,
       apiKey,
-      sourceConfig.titleRemovePattern
+      sourceConfig.titleRemovePattern,
     );

     sourceFiles = result.files;
@@ -309,7 +550,7 @@ export async function processLLMTextConfig(config, apiKey) {
     if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
       const customFiles = await processCustomUrls(
         sourceConfig.customUrls,
-        apiKey
+        apiKey,
       );

       // Merge custom files with sitemap files
@@ -382,7 +623,7 @@ export async function processLLMTextConfig(config, apiKey) {
     config.title,
     config.description,
     config.details,
-    allSources
+    allSources,
   );

   fileHierarchy[`${config.outDir}/llms.txt`] = {
@@ -420,7 +661,7 @@ function generateCombinedLlmsTxt(title, description, details, allSources) {

     // Sort files by path for consistent ordering
     const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
-      a.localeCompare(b)
+      a.localeCompare(b),
     );

     for (const [path, file] of sortedFiles) {
@@ -601,7 +842,7 @@ async function parseSitemap(sitemapUrl) {
     if (childSitemaps.length > 0) {
       // Recursively parse child sitemaps
       const childUrls = await Promise.all(
-        childSitemaps.map((url) => parseSitemap(url))
+        childSitemaps.map((url) => parseSitemap(url)),
       );
       return childUrls.flat();
     }
@@ -681,7 +922,7 @@ async function fetchUrlContent(urlStr, forceExtract = false) {

     // Look for markdown alternate link
     const mdAlternateMatch = html.match(
-      /<link\s+rel=["']alternate["']\s+type=["']text\/markdown["']\s+href=["']([^"']+)["'][^>]*>/i
+      /<link\s+rel=["']alternate["']\s+type=["']text\/markdown["']\s+href=["']([^"']+)["'][^>]*>/i,
     );

     if (mdAlternateMatch) {
@@ -765,7 +1006,7 @@ function extractMetadata(html) {

   // Extract og:description
   const ogDescMatch = html.match(
-    /<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i
+    /<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i,
   );
   if (ogDescMatch) {
     description = ogDescMatch[1].trim();
@@ -774,7 +1015,7 @@ function extractMetadata(html) {
   // Fallback to meta description
   if (!description) {
     const metaDescMatch = html.match(
-      /<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i
+      /<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i,
     );
     if (metaDescMatch) {
       description = metaDescMatch[1].trim();
@@ -848,7 +1089,7 @@ async function callParallelExtractAPI(urls, apiKey) {

   if (!response.ok) {
     throw new Error(
-      `Extract API failed: ${response.status} ${response.statusText}`
+      `Extract API failed: ${response.status} ${response.statusText}`,
     );
   }

package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.20",
+  "version": "0.0.22",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [
@@ -11,5 +11,8 @@
   "license": "MIT",
   "devDependencies": {
     "@cloudflare/workers-types": "4.20251011.0"
+  },
+  "dependencies": {
+    "parse-llms-txt": "^0.0.9"
   }
 }
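
One note on the new `dependencies` block, the package's first: under npm's semver rules, a caret range on a `0.0.x` version such as `^0.0.9` matches only that patch version (`>=0.0.9 <0.0.10`), so installs are effectively pinned to `parse-llms-txt` 0.0.9 until the range is widened.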