extract-from-sitemap 0.0.10 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +3 -2
- package/mod.js +23 -24
- package/package.json +1 -1
package/cli.js
CHANGED
|
@@ -17,8 +17,8 @@ const { extractFromSitemap } = require("./mod.js");
|
|
|
17
17
|
* @property {boolean} [forceExtract] - Whether to force extraction for this source
|
|
18
18
|
* @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
|
|
19
19
|
* @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
|
|
20
|
+
* @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
|
|
20
21
|
*/
|
|
21
|
-
|
|
22
22
|
/**
|
|
23
23
|
* @typedef {Object} Config
|
|
24
24
|
* @property {string} outDir - Top-level output directory for combined llms.txt
|
|
@@ -651,7 +651,8 @@ async function main() {
|
|
|
651
651
|
const result = await extractFromSitemap(
|
|
652
652
|
sourceConfig.origin,
|
|
653
653
|
sourceConfig.forceExtract,
|
|
654
|
-
apiKey
|
|
654
|
+
apiKey,
|
|
655
|
+
sourceConfig.titleRemovePattern
|
|
655
656
|
);
|
|
656
657
|
|
|
657
658
|
console.log(
|
package/mod.js
CHANGED
|
@@ -27,9 +27,15 @@
|
|
|
27
27
|
* @param {string} origin - The origin URL to extract from
|
|
28
28
|
* @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
|
|
29
29
|
* @param {string} apiKey - Parallel API key
|
|
30
|
+
* @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
|
|
30
31
|
* @returns {Promise<ResponseData>}
|
|
31
32
|
*/
|
|
32
|
-
export async function extractFromSitemap(
|
|
33
|
+
export async function extractFromSitemap(
|
|
34
|
+
origin,
|
|
35
|
+
forceExtract = false,
|
|
36
|
+
apiKey,
|
|
37
|
+
titleRemovePattern
|
|
38
|
+
) {
|
|
33
39
|
const startTime = Date.now();
|
|
34
40
|
let fetchCount = 0;
|
|
35
41
|
let extractApiCallCount = 0;
|
|
@@ -57,7 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
|
|
|
57
63
|
const path = getPathFromUrl(urlStr) + ".md";
|
|
58
64
|
files[path] = {
|
|
59
65
|
content: result.content,
|
|
60
|
-
title: cleanTitle(result.title,
|
|
66
|
+
title: cleanTitle(result.title, titleRemovePattern),
|
|
61
67
|
description: cleanDescription(result.description, result.title),
|
|
62
68
|
extracted: false,
|
|
63
69
|
status: result.status,
|
|
@@ -117,7 +123,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
|
|
|
117
123
|
const content = result.full_content || existing.content;
|
|
118
124
|
files[path] = {
|
|
119
125
|
content,
|
|
120
|
-
title: cleanTitle(result.title || existing.title,
|
|
126
|
+
title: cleanTitle(result.title || existing.title, titleRemovePattern),
|
|
121
127
|
description: cleanDescription(
|
|
122
128
|
existing.description,
|
|
123
129
|
result.title || existing.title
|
|
@@ -171,32 +177,25 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
|
|
|
171
177
|
}
|
|
172
178
|
|
|
173
179
|
/**
|
|
174
|
-
* Clean title by removing
|
|
180
|
+
* Clean title by removing custom pattern if provided
|
|
175
181
|
* @param {string} title - Original title
|
|
176
|
-
* @param {string}
|
|
182
|
+
* @param {string} [removePattern] - Optional regex pattern to remove from title
|
|
177
183
|
* @returns {string} Cleaned title
|
|
178
184
|
*/
|
|
179
|
-
function cleanTitle(title,
|
|
185
|
+
function cleanTitle(title, removePattern) {
|
|
180
186
|
if (!title) return "";
|
|
181
187
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
/\s*[-|•]\s*Home\s*$/i,
|
|
191
|
-
/\s*[-|•]\s*Documentation\s*$/i,
|
|
192
|
-
];
|
|
193
|
-
|
|
194
|
-
let cleaned = title;
|
|
195
|
-
for (const pattern of patterns) {
|
|
196
|
-
cleaned = cleaned.replace(pattern, "");
|
|
188
|
+
if (removePattern) {
|
|
189
|
+
try {
|
|
190
|
+
const regex = new RegExp(removePattern, "gi");
|
|
191
|
+
return title.replace(regex, "").trim();
|
|
192
|
+
} catch (error) {
|
|
193
|
+
console.warn(`Invalid titleRemovePattern: ${error.message}`);
|
|
194
|
+
return title.trim();
|
|
195
|
+
}
|
|
197
196
|
}
|
|
198
197
|
|
|
199
|
-
return
|
|
198
|
+
return title.trim();
|
|
200
199
|
}
|
|
201
200
|
|
|
202
201
|
/**
|
|
@@ -505,12 +504,12 @@ function getPathFromUrl(urlStr) {
|
|
|
505
504
|
|
|
506
505
|
// Handle root path
|
|
507
506
|
if (path === "/" || path === "") {
|
|
508
|
-
return "/index
|
|
507
|
+
return "/index";
|
|
509
508
|
}
|
|
510
509
|
|
|
511
510
|
// Handle paths ending with /
|
|
512
511
|
if (path.endsWith("/")) {
|
|
513
|
-
path += "index
|
|
512
|
+
path += "index";
|
|
514
513
|
}
|
|
515
514
|
|
|
516
515
|
return path;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.10",
|
|
4
|
+
"version": "0.0.12",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|