extract-from-sitemap 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/cli.js +3 -2
  2. package/mod.js +23 -24
  3. package/package.json +1 -1
package/cli.js CHANGED
@@ -17,8 +17,8 @@ const { extractFromSitemap } = require("./mod.js");
17
17
  * @property {boolean} [forceExtract] - Whether to force extraction for this source
18
18
  * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
19
19
  * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
20
+ * @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
20
21
  */
21
-
22
22
  /**
23
23
  * @typedef {Object} Config
24
24
  * @property {string} outDir - Top-level output directory for combined llms.txt
@@ -651,7 +651,8 @@ async function main() {
651
651
  const result = await extractFromSitemap(
652
652
  sourceConfig.origin,
653
653
  sourceConfig.forceExtract,
654
- apiKey
654
+ apiKey,
655
+ sourceConfig.titleRemovePattern
655
656
  );
656
657
 
657
658
  console.log(
package/mod.js CHANGED
@@ -27,9 +27,15 @@
27
27
  * @param {string} origin - The origin URL to extract from
28
28
  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
29
29
  * @param {string} apiKey - Parallel API key
30
+ * @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
30
31
  * @returns {Promise<ResponseData>}
31
32
  */
32
- export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
33
+ export async function extractFromSitemap(
34
+ origin,
35
+ forceExtract = false,
36
+ apiKey,
37
+ titleRemovePattern
38
+ ) {
33
39
  const startTime = Date.now();
34
40
  let fetchCount = 0;
35
41
  let extractApiCallCount = 0;
@@ -57,7 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
57
63
  const path = getPathFromUrl(urlStr) + ".md";
58
64
  files[path] = {
59
65
  content: result.content,
60
- title: cleanTitle(result.title, origin),
66
+ title: cleanTitle(result.title, titleRemovePattern),
61
67
  description: cleanDescription(result.description, result.title),
62
68
  extracted: false,
63
69
  status: result.status,
@@ -117,7 +123,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
117
123
  const content = result.full_content || existing.content;
118
124
  files[path] = {
119
125
  content,
120
- title: cleanTitle(result.title || existing.title, origin),
126
+ title: cleanTitle(result.title || existing.title, titleRemovePattern),
121
127
  description: cleanDescription(
122
128
  existing.description,
123
129
  result.title || existing.title
@@ -171,32 +177,25 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
171
177
  }
172
178
 
173
179
  /**
174
- * Clean title by removing site name duplicates
180
+ * Clean title by removing custom pattern if provided
175
181
  * @param {string} title - Original title
176
- * @param {string} origin - Site origin
182
+ * @param {string} [removePattern] - Optional regex pattern to remove from title
177
183
  * @returns {string} Cleaned title
178
184
  */
179
- function cleanTitle(title, origin) {
185
+ function cleanTitle(title, removePattern) {
180
186
  if (!title) return "";
181
187
 
182
- // Extract domain name from origin
183
- const domain = new URL(origin).hostname.replace(/^www\./, "");
184
- const siteName = domain.split(".")[0];
185
-
186
- // Remove common site name patterns from end of title
187
- const patterns = [
188
- new RegExp(`\\s*[-|•]\\s*${siteName}\\s*$`, "i"),
189
- new RegExp(`\\s*[-|•]\\s*${domain}\\s*$`, "i"),
190
- /\s*[-|•]\s*Home\s*$/i,
191
- /\s*[-|•]\s*Documentation\s*$/i,
192
- ];
193
-
194
- let cleaned = title;
195
- for (const pattern of patterns) {
196
- cleaned = cleaned.replace(pattern, "");
188
+ if (removePattern) {
189
+ try {
190
+ const regex = new RegExp(removePattern, "gi");
191
+ return title.replace(regex, "").trim();
192
+ } catch (error) {
193
+ console.warn(`Invalid titleRemovePattern: ${error.message}`);
194
+ return title.trim();
195
+ }
197
196
  }
198
197
 
199
- return cleaned.trim();
198
+ return title.trim();
200
199
  }
201
200
 
202
201
  /**
@@ -505,12 +504,12 @@ function getPathFromUrl(urlStr) {
505
504
 
506
505
  // Handle root path
507
506
  if (path === "/" || path === "") {
508
- return "/index.html";
507
+ return "/index";
509
508
  }
510
509
 
511
510
  // Handle paths ending with /
512
511
  if (path.endsWith("/")) {
513
- path += "index.html";
512
+ path += "index";
514
513
  }
515
514
 
516
515
  return path;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.10",
4
+ "version": "0.0.12",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [