extract-from-sitemap 0.0.20 → 0.0.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/mod.js +257 -16
- package/package.json +4 -1
package/mod.js
CHANGED
@@ -1,3 +1,5 @@
+import { parseLlmsTxt } from "parse-llms-txt";
+
 /**
  * @typedef {Object} FileResult
  * @property {string} [error] - Error message if file processing failed
@@ -56,8 +58,228 @@
  * @property {string} pathPrefix - Path prefix for links
  */
 
+/**
+ * Fetch llms.txt from origin if available
+ * @param {string} origin - The origin to check for llms.txt
+ * @returns {Promise<string|null>} The llms.txt content or null if not found
+ */
+async function fetchLlmsTxt(origin) {
+  const baseUrl = origin.startsWith("http") ? origin : `https://${origin}`;
+  const domain = new URL(baseUrl).origin;
+
+  try {
+    const res = await fetch(`${domain}/llms.txt`, {
+      headers: { "User-Agent": "sitemap-to-llmtext-bot/1.0" },
+    });
+
+    if (res.ok) {
+      const contentType = res.headers.get("content-type") || "";
+      if (contentType.includes("text")) {
+        return await res.text();
+      }
+    }
+  } catch {}
+
+  return null;
+}
+
+/**
+ * Extract content using llms.txt entries
+ * @param {import("../llms-txt-parse/mod.js").LlmsTxtFile} llmsTxt - Parsed llms.txt structure
+ * @param {string} origin - The origin URL for resolving relative URLs
+ * @param {string} apiKey - Parallel API key
+ * @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
+ * @returns {Promise<ResponseData>}
+ */
+async function extractFromLlmsTxtEntries(
+  llmsTxt,
+  origin,
+  apiKey,
+  titleRemovePattern,
+) {
+  const startTime = Date.now();
+  let fetchCount = 0;
+  let extractApiCallCount = 0;
+
+  const files = {};
+  const urlsNeedingExtract = [];
+
+  const baseUrl = origin.startsWith("http") ? origin : `https://${origin}`;
+  const domain = new URL(baseUrl).origin;
+
+  // Collect all file entries from all sections
+  const allEntries = llmsTxt.sections.flatMap((section) =>
+    section.files.map((file) => ({
+      ...file,
+      section: section.name,
+    })),
+  );
+
+  // Fetch all URLs from llms.txt entries
+  await Promise.all(
+    allEntries.map(async (entry) => {
+      // Resolve relative URLs against origin
+      let resolvedUrl;
+      try {
+        resolvedUrl = new URL(entry.url, domain).href;
+      } catch {
+        resolvedUrl = entry.url;
+      }
+
+      try {
+        // Try fetching the markdown URL directly
+        const res = await fetch(resolvedUrl, {
+          headers: {
+            Accept: "text/markdown, text/plain, */*",
+            "User-Agent": "sitemap-to-llmtext-bot/1.0",
+          },
+        });
+        fetchCount++;
+
+        const path = getPathFromUrl(resolvedUrl);
+        // Ensure .md extension
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+
+        if (res.ok) {
+          const content = await res.text();
+
+          // Check if we actually got markdown (not HTML)
+          const isMarkdown =
+            !content.trim().startsWith("<!DOCTYPE") &&
+            !content.trim().startsWith("<html");
+
+          if (isMarkdown && content.trim()) {
+            files[filePath] = {
+              content,
+              title: cleanTitle(entry.name, titleRemovePattern),
+              description: cleanDescription(entry.notes || "", entry.name),
+              extracted: false,
+              status: res.status,
+              tokens: Math.round(content.length / 5),
+              publishedDate: "",
+              originalUrl: resolvedUrl,
+            };
+            return;
+          }
+        }
+
+        // Mark for extraction fallback
+        files[filePath] = {
+          content: "",
+          title: cleanTitle(entry.name, titleRemovePattern),
+          description: cleanDescription(entry.notes || "", entry.name),
+          extracted: false,
+          status: res.status,
+          tokens: 0,
+          publishedDate: "",
+          originalUrl: resolvedUrl,
+          error: "Could not fetch markdown content",
+        };
+        urlsNeedingExtract.push(resolvedUrl);
+      } catch (error) {
+        const path = getPathFromUrl(resolvedUrl);
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+        files[filePath] = {
+          error: error instanceof Error ? error.message : "Unknown error",
+          content: "",
+          title: cleanTitle(entry.name, titleRemovePattern),
+          description: cleanDescription(entry.notes || "", entry.name),
+          extracted: false,
+          status: 0,
+          tokens: 0,
+          publishedDate: "",
+          originalUrl: resolvedUrl,
+        };
+        urlsNeedingExtract.push(resolvedUrl);
+      }
+    }),
+  );
+
+  // Use Parallel Extract API for URLs that didn't return content
+  if (urlsNeedingExtract.length > 0 && apiKey) {
+    try {
+      extractApiCallCount = 1;
+      const extractResults = await callParallelExtractAPI(
+        urlsNeedingExtract,
+        apiKey,
+      );
+
+      // Merge extract results
+      for (const result of extractResults.results) {
+        const path = getPathFromUrl(result.url);
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+        const existing = files[filePath] || {
+          content: "",
+          title: "",
+          description: "",
+          extracted: false,
+          status: 0,
+          tokens: 0,
+          publishedDate: "",
+          originalUrl: result.url,
+        };
+
+        const content = result.full_content || existing.content;
+        files[filePath] = {
+          content,
+          title: cleanTitle(result.title || existing.title, titleRemovePattern),
+          description: cleanDescription(
+            existing.description,
+            result.title || existing.title,
+          ),
+          extracted: !!result.full_content,
+          publishedDate: result.published_date || existing.publishedDate,
+          status: existing.status,
+          tokens: Math.round(content.length / 5),
+          originalUrl: existing.originalUrl,
+        };
+      }
+
+      // Handle extract errors
+      for (const error of extractResults.errors) {
+        const path = getPathFromUrl(error.url);
+        const filePath = path.endsWith(".md") ? path : path + ".md";
+        if (files[filePath]) {
+          files[filePath].error = error.message;
+        }
+      }
+    } catch (error) {
+      console.error("Extract API error:", error);
+    }
+  }
+
+  // Sort files by path
+  const sortedFiles = Object.keys(files)
+    .sort()
+    .reduce((acc, key) => {
+      acc[key] = files[key];
+      return acc;
+    }, {});
+
+  // Calculate totals
+  const totalTokens = Object.values(sortedFiles).reduce(
+    (sum, file) => sum + file.tokens,
+    0,
+  );
+  const totalPages = Object.keys(sortedFiles).length;
+  const errors = Object.values(sortedFiles).filter((file) => file.error).length;
+  const processingTimeMs = Date.now() - startTime;
+
+  return {
+    files: sortedFiles,
+    totalTokens,
+    totalPages,
+    errors,
+    processingTimeMs,
+    extractApiCallCount,
+    fetchCount,
+    usedLlmsTxt: true,
+  };
+}
+
 /**
  * Extract content from sitemap URLs with markdown variant detection
+ * Tries llms.txt first if available, then falls back to sitemap
  * @param {string} origin - The origin URL to extract from
  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
  * @param {string} apiKey - Parallel API key
@@ -68,16 +290,35 @@ export async function extractFromSitemap(
   origin,
   forceExtract = false,
   apiKey,
-  titleRemovePattern
+  titleRemovePattern,
 ) {
   const startTime = Date.now();
   let fetchCount = 0;
   let extractApiCallCount = 0;
 
-  //
+  // Try llms.txt first
+  const llmsTxtContent = await fetchLlmsTxt(origin);
+  if (llmsTxtContent) {
+    const llmsTxt = parseLlmsTxt(llmsTxtContent);
+    const totalEntries = llmsTxt.sections.reduce(
+      (sum, section) => sum + section.files.length,
+      0,
+    );
+    if (totalEntries > 0) {
+      console.log(`Found llms.txt with ${totalEntries} entries for ${origin}`);
+      return extractFromLlmsTxtEntries(
+        llmsTxt,
+        origin,
+        apiKey,
+        titleRemovePattern,
+      );
+    }
+  }
+
+  // Fall back to sitemap discovery
   const sitemapUrl = await discoverSitemap(origin);
   if (!sitemapUrl) {
-    throw new Error(`Could not find sitemap for ${origin}`);
+    throw new Error(`Could not find sitemap or llms.txt for ${origin}`);
   }
 
   // Parse sitemap and get URLs
@@ -128,7 +369,7 @@ export async function extractFromSitemap(
           urlsNeedingExtract.push(urlStr);
         }
       }
-    })
+    }),
   );
 
   // Use Parallel Extract API for URLs that didn't return content
@@ -137,7 +378,7 @@ export async function extractFromSitemap(
       extractApiCallCount = 1;
       const extractResults = await callParallelExtractAPI(
         urlsNeedingExtract,
-        apiKey
+        apiKey,
       );
 
       // Merge extract results
@@ -160,7 +401,7 @@ export async function extractFromSitemap(
           title: cleanTitle(result.title || existing.title, titleRemovePattern),
           description: cleanDescription(
             existing.description,
-            result.title || existing.title
+            result.title || existing.title,
           ),
           extracted: !!result.full_content,
           publishedDate: result.published_date || existing.publishedDate,
@@ -193,7 +434,7 @@ export async function extractFromSitemap(
   // Calculate totals
   const totalTokens = Object.values(sortedFiles).reduce(
     (sum, file) => sum + file.tokens,
-    0
+    0,
   );
   const totalPages = Object.keys(sortedFiles).length;
   const errors = Object.values(sortedFiles).filter((file) => file.error).length;
@@ -296,7 +537,7 @@ export async function processLLMTextConfig(config, apiKey) {
       sourceConfig.origin,
       sourceConfig.forceExtract || false,
       apiKey,
-      sourceConfig.titleRemovePattern
+      sourceConfig.titleRemovePattern,
     );
 
     sourceFiles = result.files;
@@ -309,7 +550,7 @@ export async function processLLMTextConfig(config, apiKey) {
     if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
       const customFiles = await processCustomUrls(
         sourceConfig.customUrls,
-        apiKey
+        apiKey,
       );
 
       // Merge custom files with sitemap files
@@ -382,7 +623,7 @@ export async function processLLMTextConfig(config, apiKey) {
     config.title,
     config.description,
     config.details,
-    allSources
+    allSources,
   );
 
   fileHierarchy[`${config.outDir}/llms.txt`] = {
@@ -420,7 +661,7 @@ function generateCombinedLlmsTxt(title, description, details, allSources) {
 
   // Sort files by path for consistent ordering
   const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
-    a.localeCompare(b)
+    a.localeCompare(b),
   );
 
   for (const [path, file] of sortedFiles) {
@@ -601,7 +842,7 @@ async function parseSitemap(sitemapUrl) {
   if (childSitemaps.length > 0) {
     // Recursively parse child sitemaps
     const childUrls = await Promise.all(
-      childSitemaps.map((url) => parseSitemap(url))
+      childSitemaps.map((url) => parseSitemap(url)),
    );
     return childUrls.flat();
   }
@@ -681,7 +922,7 @@ async function fetchUrlContent(urlStr, forceExtract = false) {
 
   // Look for markdown alternate link
   const mdAlternateMatch = html.match(
-    /<link\s+rel=["']alternate["']\s+type=["']text\/markdown["']\s+href=["']([^"']+)["'][^>]*>/i
+    /<link\s+rel=["']alternate["']\s+type=["']text\/markdown["']\s+href=["']([^"']+)["'][^>]*>/i,
   );
 
   if (mdAlternateMatch) {
@@ -765,7 +1006,7 @@ function extractMetadata(html) {
 
   // Extract og:description
   const ogDescMatch = html.match(
-    /<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i
+    /<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i,
   );
   if (ogDescMatch) {
     description = ogDescMatch[1].trim();
@@ -774,7 +1015,7 @@ function extractMetadata(html) {
   // Fallback to meta description
   if (!description) {
     const metaDescMatch = html.match(
-      /<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i
+      /<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i,
     );
     if (metaDescMatch) {
       description = metaDescMatch[1].trim();
@@ -848,7 +1089,7 @@ async function callParallelExtractAPI(urls, apiKey) {
 
   if (!response.ok) {
     throw new Error(
-      `Extract API failed: ${response.status} ${response.statusText}`
+      `Extract API failed: ${response.status} ${response.statusText}`,
     );
   }
 
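Taken together, the mod.js changes give every caller of extractFromSitemap llms.txt-first behavior with no signature change beyond trailing-comma formatting: if <origin>/llms.txt exists and lists entries, those entries are fetched directly (falling back to the Parallel Extract API for entries that do not resolve to markdown); otherwise the existing sitemap path runs, now with a combined error message. A minimal usage sketch, assuming the package resolves as an ES module via its "main" entry and that PARALLEL_API_KEY is merely an illustrative place to keep the key (neither is shown in this diff):

import { extractFromSitemap } from "extract-from-sitemap";

// Signature per the diff: origin, forceExtract, apiKey, titleRemovePattern.
const result = await extractFromSitemap(
  "example.com",
  false, // don't force the Extract API; try markdown variants first
  process.env.PARALLEL_API_KEY,
  undefined,
);

// On the new llms.txt path the result includes usedLlmsTxt: true,
// alongside the usual totals (totalPages, totalTokens, errors, ...).
console.log(result.usedLlmsTxt, result.totalPages, result.totalTokens);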
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.20",
+  "version": "0.0.22",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [
@@ -11,5 +11,8 @@
   "license": "MIT",
   "devDependencies": {
     "@cloudflare/workers-types": "4.20251011.0"
+  },
+  "dependencies": {
+    "parse-llms-txt": "^0.0.9"
   }
 }
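The manifest changes mirror the code: the version bump plus a new runtime dependency on parse-llms-txt, which backs the parseLlmsTxt import added at the top of mod.js. Picking up the llms.txt support should only require updating the package (assuming npm as the registry client):

npm install extract-from-sitemap@0.0.22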