extract-from-sitemap 0.0.9 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +5 -4
- package/mod.js +21 -22
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -17,8 +17,8 @@ const { extractFromSitemap } = require("./mod.js");
|
|
|
17
17
|
* @property {boolean} [forceExtract] - Whether to force extraction for this source
|
|
18
18
|
* @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
|
|
19
19
|
* @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
|
|
20
|
+
* @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
|
|
20
21
|
*/
|
|
21
|
-
|
|
22
22
|
/**
|
|
23
23
|
* @typedef {Object} Config
|
|
24
24
|
* @property {string} outDir - Top-level output directory for combined llms.txt
|
|
@@ -541,7 +541,7 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
|
|
|
541
541
|
}
|
|
542
542
|
|
|
543
543
|
const relativePath = path.relative(resolvedTopLevel, resolvedSource);
|
|
544
|
-
return relativePath
|
|
544
|
+
return relativePath || "";
|
|
545
545
|
}
|
|
546
546
|
|
|
547
547
|
/**
|
|
@@ -571,7 +571,7 @@ function generateCombinedLlmsTxt(allSources) {
|
|
|
571
571
|
if (source.keepOriginalUrls) {
|
|
572
572
|
link = file.originalUrl;
|
|
573
573
|
} else {
|
|
574
|
-
link = source.pathPrefix + path;
|
|
574
|
+
link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
|
|
575
575
|
}
|
|
576
576
|
|
|
577
577
|
combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
|
|
@@ -651,7 +651,8 @@ async function main() {
|
|
|
651
651
|
const result = await extractFromSitemap(
|
|
652
652
|
sourceConfig.origin,
|
|
653
653
|
sourceConfig.forceExtract,
|
|
654
|
-
apiKey
|
|
654
|
+
apiKey,
|
|
655
|
+
sourceConfig.titleRemovePattern
|
|
655
656
|
);
|
|
656
657
|
|
|
657
658
|
console.log(
|
package/mod.js
CHANGED
|
@@ -27,9 +27,15 @@
|
|
|
27
27
|
* @param {string} origin - The origin URL to extract from
|
|
28
28
|
* @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
|
|
29
29
|
* @param {string} apiKey - Parallel API key
|
|
30
|
+
* @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
|
|
30
31
|
* @returns {Promise<ResponseData>}
|
|
31
32
|
*/
|
|
32
|
-
export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
|
|
33
|
+
export async function extractFromSitemap(
|
|
34
|
+
origin,
|
|
35
|
+
forceExtract = false,
|
|
36
|
+
apiKey,
|
|
37
|
+
titleRemovePattern
|
|
38
|
+
) {
|
|
33
39
|
const startTime = Date.now();
|
|
34
40
|
let fetchCount = 0;
|
|
35
41
|
let extractApiCallCount = 0;
|
|
@@ -57,7 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
|
|
|
57
63
|
const path = getPathFromUrl(urlStr) + ".md";
|
|
58
64
|
files[path] = {
|
|
59
65
|
content: result.content,
|
|
60
|
-
title: cleanTitle(result.title,
|
|
66
|
+
title: cleanTitle(result.title, titleRemovePattern),
|
|
61
67
|
description: cleanDescription(result.description, result.title),
|
|
62
68
|
extracted: false,
|
|
63
69
|
status: result.status,
|
|
@@ -117,7 +123,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
|
|
|
117
123
|
const content = result.full_content || existing.content;
|
|
118
124
|
files[path] = {
|
|
119
125
|
content,
|
|
120
|
-
title: cleanTitle(result.title || existing.title,
|
|
126
|
+
title: cleanTitle(result.title || existing.title, titleRemovePattern),
|
|
121
127
|
description: cleanDescription(
|
|
122
128
|
existing.description,
|
|
123
129
|
result.title || existing.title
|
|
@@ -171,32 +177,25 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
|
|
|
171
177
|
}
|
|
172
178
|
|
|
173
179
|
/**
|
|
174
|
-
* Clean title by removing
|
|
180
|
+
* Clean title by removing custom pattern if provided
|
|
175
181
|
* @param {string} title - Original title
|
|
176
|
-
* @param {string}
|
|
182
|
+
* @param {string} [removePattern] - Optional regex pattern to remove from title
|
|
177
183
|
* @returns {string} Cleaned title
|
|
178
184
|
*/
|
|
179
|
-
function cleanTitle(title,
|
|
185
|
+
function cleanTitle(title, removePattern) {
|
|
180
186
|
if (!title) return "";
|
|
181
187
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
/\s*[-|•]\s*Home\s*$/i,
|
|
191
|
-
/\s*[-|•]\s*Documentation\s*$/i,
|
|
192
|
-
];
|
|
193
|
-
|
|
194
|
-
let cleaned = title;
|
|
195
|
-
for (const pattern of patterns) {
|
|
196
|
-
cleaned = cleaned.replace(pattern, "");
|
|
188
|
+
if (removePattern) {
|
|
189
|
+
try {
|
|
190
|
+
const regex = new RegExp(removePattern, "gi");
|
|
191
|
+
return title.replace(regex, "").trim();
|
|
192
|
+
} catch (error) {
|
|
193
|
+
console.warn(`Invalid titleRemovePattern: ${error.message}`);
|
|
194
|
+
return title.trim();
|
|
195
|
+
}
|
|
197
196
|
}
|
|
198
197
|
|
|
199
|
-
return
|
|
198
|
+
return title.trim();
|
|
200
199
|
}
|
|
201
200
|
|
|
202
201
|
/**
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.9",
|
|
4
|
+
"version": "0.0.11",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|