@mdream/crawl 0.17.0 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -91,6 +91,12 @@ function isUrlExcluded(url, excludePatterns) {
91
91
  }
92
92
  }
93
93
  /**
94
+ * Check if a string is valid sitemap XML content (not an HTML page or other non-sitemap response)
95
+ */
96
+ function isValidSitemapXml(content) {
97
+ return content.includes("<urlset") || content.includes("<sitemapindex");
98
+ }
99
+ /**
94
100
  * Validate glob pattern syntax
95
101
  */
96
102
  function validateGlobPattern(pattern) {
@@ -175,6 +181,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
175
181
  clearTimeout(timeoutId);
176
182
  if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
177
183
  const xmlContent = await response.text();
184
+ if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
178
185
  if (xmlContent.includes("<sitemapindex")) {
179
186
  SITEMAP_INDEX_LOC_RE.lastIndex = 0;
180
187
  const childSitemaps = [];
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.17.0",
4
+ "version": "0.17.1",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -56,7 +56,7 @@
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
58
  "ufo": "^1.6.3",
59
- "mdream": "0.17.0"
59
+ "mdream": "0.17.1"
60
60
  },
61
61
  "devDependencies": {
62
62
  "@types/picomatch": "^4.0.2"