extract-from-sitemap 0.0.9 → 0.0.11

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. package/cli.js +5 -4
  2. package/mod.js +21 -22
  3. package/package.json +1 -1
package/cli.js CHANGED
@@ -17,8 +17,8 @@ const { extractFromSitemap } = require("./mod.js");
17
17
  * @property {boolean} [forceExtract] - Whether to force extraction for this source
18
18
  * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
19
19
  * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
20
+ * @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
20
21
  */
21
-
22
22
  /**
23
23
  * @typedef {Object} Config
24
24
  * @property {string} outDir - Top-level output directory for combined llms.txt
@@ -541,7 +541,7 @@ function getPathPrefix(topLevelOutDir, sourceOutDir) {
541
541
  }
542
542
 
543
543
  const relativePath = path.relative(resolvedTopLevel, resolvedSource);
544
- return relativePath ? relativePath + "/" : "";
544
+ return relativePath || "";
545
545
  }
546
546
 
547
547
  /**
@@ -571,7 +571,7 @@ function generateCombinedLlmsTxt(allSources) {
571
571
  if (source.keepOriginalUrls) {
572
572
  link = file.originalUrl;
573
573
  } else {
574
- link = source.pathPrefix + path;
574
+ link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
575
575
  }
576
576
 
577
577
  combinedTxt += `- [${title}](${link}) (${file.tokens} tokens)${description}\n`;
@@ -651,7 +651,8 @@ async function main() {
651
651
  const result = await extractFromSitemap(
652
652
  sourceConfig.origin,
653
653
  sourceConfig.forceExtract,
654
- apiKey
654
+ apiKey,
655
+ sourceConfig.titleRemovePattern
655
656
  );
656
657
 
657
658
  console.log(
package/mod.js CHANGED
@@ -27,9 +27,15 @@
27
27
  * @param {string} origin - The origin URL to extract from
28
28
  * @param {boolean} forceExtract - Whether to force using extract API instead of markdown variants
29
29
  * @param {string} apiKey - Parallel API key
30
+ * @param {string} [titleRemovePattern] - Optional regex pattern to remove from titles
30
31
  * @returns {Promise<ResponseData>}
31
32
  */
32
- export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
33
+ export async function extractFromSitemap(
34
+ origin,
35
+ forceExtract = false,
36
+ apiKey,
37
+ titleRemovePattern
38
+ ) {
33
39
  const startTime = Date.now();
34
40
  let fetchCount = 0;
35
41
  let extractApiCallCount = 0;
@@ -57,7 +63,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
57
63
  const path = getPathFromUrl(urlStr) + ".md";
58
64
  files[path] = {
59
65
  content: result.content,
60
- title: cleanTitle(result.title, origin),
66
+ title: cleanTitle(result.title, titleRemovePattern),
61
67
  description: cleanDescription(result.description, result.title),
62
68
  extracted: false,
63
69
  status: result.status,
@@ -117,7 +123,7 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
117
123
  const content = result.full_content || existing.content;
118
124
  files[path] = {
119
125
  content,
120
- title: cleanTitle(result.title || existing.title, origin),
126
+ title: cleanTitle(result.title || existing.title, titleRemovePattern),
121
127
  description: cleanDescription(
122
128
  existing.description,
123
129
  result.title || existing.title
@@ -171,32 +177,25 @@ export async function extractFromSitemap(origin, forceExtract = false, apiKey) {
171
177
  }
172
178
 
173
179
  /**
174
- * Clean title by removing site name duplicates
180
+ * Clean title by removing custom pattern if provided
175
181
  * @param {string} title - Original title
176
- * @param {string} origin - Site origin
182
+ * @param {string} [removePattern] - Optional regex pattern to remove from title
177
183
  * @returns {string} Cleaned title
178
184
  */
179
- function cleanTitle(title, origin) {
185
+ function cleanTitle(title, removePattern) {
180
186
  if (!title) return "";
181
187
 
182
- // Extract domain name from origin
183
- const domain = new URL(origin).hostname.replace(/^www\./, "");
184
- const siteName = domain.split(".")[0];
185
-
186
- // Remove common site name patterns from end of title
187
- const patterns = [
188
- new RegExp(`\\s*[-|•]\\s*${siteName}\\s*$`, "i"),
189
- new RegExp(`\\s*[-|•]\\s*${domain}\\s*$`, "i"),
190
- /\s*[-|•]\s*Home\s*$/i,
191
- /\s*[-|•]\s*Documentation\s*$/i,
192
- ];
193
-
194
- let cleaned = title;
195
- for (const pattern of patterns) {
196
- cleaned = cleaned.replace(pattern, "");
188
+ if (removePattern) {
189
+ try {
190
+ const regex = new RegExp(removePattern, "gi");
191
+ return title.replace(regex, "").trim();
192
+ } catch (error) {
193
+ console.warn(`Invalid titleRemovePattern: ${error.message}`);
194
+ return title.trim();
195
+ }
197
196
  }
198
197
 
199
- return cleaned.trim();
198
+ return title.trim();
200
199
  }
201
200
 
202
201
  /**
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.9",
4
+ "version": "0.0.11",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [