@odavl/guardian 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +210 -210
- package/LICENSE +21 -21
- package/README.md +297 -184
- package/bin/guardian.js +2242 -2221
- package/config/README.md +59 -59
- package/config/guardian.config.json +54 -54
- package/config/guardian.policy.json +12 -12
- package/config/profiles/docs.yaml +18 -18
- package/config/profiles/ecommerce.yaml +17 -17
- package/config/profiles/landing-demo.yaml +16 -16
- package/config/profiles/marketing.yaml +18 -18
- package/config/profiles/saas.yaml +21 -21
- package/flows/example-login-flow.json +36 -36
- package/flows/example-signup-flow.json +44 -44
- package/package.json +124 -116
- package/policies/enterprise.json +12 -12
- package/policies/landing-demo.json +22 -22
- package/policies/saas.json +12 -12
- package/policies/startup.json +12 -12
- package/src/enterprise/audit-logger.js +166 -166
- package/src/enterprise/pdf-exporter.js +267 -267
- package/src/enterprise/rbac-gate.js +142 -142
- package/src/enterprise/rbac.js +239 -239
- package/src/enterprise/site-manager.js +180 -180
- package/src/founder/feedback-system.js +156 -156
- package/src/founder/founder-tracker.js +213 -213
- package/src/founder/usage-signals.js +141 -141
- package/src/guardian/action-hints.js +439 -439
- package/src/guardian/alert-ledger.js +121 -121
- package/src/guardian/artifact-sanitizer.js +56 -56
- package/src/guardian/attempt-engine.js +1069 -1029
- package/src/guardian/attempt-registry.js +267 -267
- package/src/guardian/attempt-relevance.js +106 -106
- package/src/guardian/attempt-reporter.js +513 -507
- package/src/guardian/attempt.js +274 -273
- package/src/guardian/attempts-filter.js +63 -63
- package/src/guardian/auto-attempt-builder.js +283 -283
- package/src/guardian/baseline-registry.js +177 -177
- package/src/guardian/baseline-reporter.js +143 -143
- package/src/guardian/baseline-storage.js +285 -285
- package/src/guardian/baseline.js +535 -534
- package/src/guardian/behavioral-signals.js +261 -261
- package/src/guardian/breakage-intelligence.js +224 -224
- package/src/guardian/browser-pool.js +131 -131
- package/src/guardian/browser.js +119 -119
- package/src/guardian/canonical-truth.js +308 -308
- package/src/guardian/ci-cli.js +121 -121
- package/src/guardian/ci-gate.js +96 -96
- package/src/guardian/ci-mode.js +15 -15
- package/src/guardian/ci-output.js +55 -38
- package/src/guardian/cli-summary.js +102 -102
- package/src/guardian/confidence-signals.js +251 -251
- package/src/guardian/config-loader.js +161 -161
- package/src/guardian/config-validator.js +285 -283
- package/src/guardian/coverage-model.js +239 -239
- package/src/guardian/coverage-packs.js +58 -58
- package/src/guardian/crawler.js +142 -142
- package/src/guardian/data-guardian-detector.js +189 -189
- package/src/guardian/decision-authority.js +746 -725
- package/src/guardian/detection-layers.js +271 -271
- package/src/guardian/determinism.js +146 -146
- package/src/guardian/discovery-engine.js +661 -661
- package/src/guardian/drift-detector.js +100 -100
- package/src/guardian/enhanced-html-reporter.js +522 -522
- package/src/guardian/env-guard.js +128 -127
- package/src/guardian/error-clarity.js +399 -399
- package/src/guardian/export-contract.js +196 -196
- package/src/guardian/fail-safe.js +212 -212
- package/src/guardian/failure-intelligence.js +173 -173
- package/src/guardian/failure-taxonomy.js +169 -169
- package/src/guardian/final-outcome.js +206 -206
- package/src/guardian/first-run-profile.js +89 -89
- package/src/guardian/first-run.js +65 -67
- package/src/guardian/flag-validator.js +111 -111
- package/src/guardian/flow-executor.js +641 -639
- package/src/guardian/flow-registry.js +67 -67
- package/src/guardian/honesty.js +394 -394
- package/src/guardian/html-reporter.js +416 -416
- package/src/guardian/human-intent-resolver.js +296 -296
- package/src/guardian/human-interaction-model.js +351 -351
- package/src/guardian/human-journey-context.js +184 -184
- package/src/guardian/human-navigator.js +544 -544
- package/src/guardian/human-reporter.js +435 -431
- package/src/guardian/index.js +226 -221
- package/src/guardian/init-command.js +143 -143
- package/src/guardian/intent-detector.js +148 -146
- package/src/guardian/journey-definitions.js +132 -132
- package/src/guardian/journey-scan-cli.js +142 -145
- package/src/guardian/journey-scanner.js +583 -583
- package/src/guardian/junit-reporter.js +281 -281
- package/src/guardian/language-detection.js +99 -99
- package/src/guardian/live-alert.js +56 -56
- package/src/guardian/live-baseline-compare.js +146 -146
- package/src/guardian/live-cli.js +95 -95
- package/src/guardian/live-guardian.js +210 -210
- package/src/guardian/live-scheduler-runner.js +137 -137
- package/src/guardian/live-scheduler-state.js +167 -168
- package/src/guardian/live-scheduler.js +146 -146
- package/src/guardian/live-state.js +110 -110
- package/src/guardian/market-criticality.js +335 -335
- package/src/guardian/market-reporter.js +577 -577
- package/src/guardian/network-trace.js +178 -178
- package/src/guardian/obs-logger.js +110 -110
- package/src/guardian/observed-capabilities.js +427 -427
- package/src/guardian/output-contract.js +154 -0
- package/src/guardian/output-readability.js +264 -264
- package/src/guardian/parallel-executor.js +116 -116
- package/src/guardian/path-safety.js +56 -56
- package/src/guardian/pattern-analyzer.js +348 -348
- package/src/guardian/policy.js +432 -434
- package/src/guardian/prelaunch-gate.js +193 -193
- package/src/guardian/prerequisite-checker.js +101 -101
- package/src/guardian/preset-loader.js +152 -157
- package/src/guardian/profile-loader.js +96 -96
- package/src/guardian/reality.js +3025 -2826
- package/src/guardian/realworld-scenarios.js +94 -94
- package/src/guardian/reporter.js +167 -167
- package/src/guardian/retry-policy.js +123 -123
- package/src/guardian/root-cause-analysis.js +171 -171
- package/src/guardian/rules-engine.js +558 -558
- package/src/guardian/run-artifacts.js +212 -212
- package/src/guardian/run-cleanup.js +207 -207
- package/src/guardian/run-export.js +522 -522
- package/src/guardian/run-latest.js +90 -90
- package/src/guardian/run-list.js +211 -211
- package/src/guardian/run-summary.js +20 -20
- package/src/guardian/runtime-root.js +246 -246
- package/src/guardian/safety.js +248 -248
- package/src/guardian/scan-presets.js +133 -149
- package/src/guardian/screenshot.js +152 -152
- package/src/guardian/secret-hygiene.js +44 -44
- package/src/guardian/selector-fallbacks.js +394 -394
- package/src/guardian/semantic-contact-detection.js +255 -255
- package/src/guardian/semantic-contact-finder.js +201 -201
- package/src/guardian/semantic-targets.js +234 -234
- package/src/guardian/site-intelligence.js +588 -588
- package/src/guardian/site-introspection.js +257 -257
- package/src/guardian/sitemap.js +225 -225
- package/src/guardian/smoke.js +283 -258
- package/src/guardian/snapshot-schema.js +177 -290
- package/src/guardian/snapshot.js +430 -397
- package/src/guardian/stability-scorer.js +169 -169
- package/src/guardian/success-evaluator.js +214 -214
- package/src/guardian/template-command.js +184 -184
- package/src/guardian/text-formatters.js +426 -426
- package/src/guardian/timeout-profiles.js +57 -57
- package/src/guardian/truth/attempt.contract.js +158 -0
- package/src/guardian/truth/decision.contract.js +275 -0
- package/src/guardian/truth/snapshot.contract.js +363 -0
- package/src/guardian/validators.js +323 -323
- package/src/guardian/verdict-card.js +474 -474
- package/src/guardian/verdict-clarity.js +298 -298
- package/src/guardian/verdict-policy.js +363 -363
- package/src/guardian/verdict.js +333 -333
- package/src/guardian/verdicts.js +79 -74
- package/src/guardian/visual-diff.js +247 -247
- package/src/guardian/wait-for-outcome.js +119 -119
- package/src/guardian/watch-runner.js +181 -181
- package/src/guardian/watchdog-diff.js +167 -167
- package/src/guardian/webhook.js +206 -206
- package/src/payments/stripe-checkout.js +169 -169
- package/src/plans/plan-definitions.js +148 -148
- package/src/plans/plan-manager.js +211 -211
- package/src/plans/usage-tracker.js +210 -210
- package/src/recipes/recipe-engine.js +188 -188
- package/src/recipes/recipe-failure-analysis.js +159 -159
- package/src/recipes/recipe-registry.js +134 -134
- package/src/recipes/recipe-runtime.js +507 -507
- package/src/recipes/recipe-store.js +410 -410
- package/SECURITY.md +0 -77
- package/VERSIONING.md +0 -100
- package/guardian-contract-v1.md +0 -502
package/src/guardian/sitemap.js
CHANGED
|
@@ -1,225 +1,225 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Guardian Sitemap Discovery Module
|
|
3
|
-
* Discovers URLs from robots.txt and sitemap.xml
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
const https = require('https');
|
|
7
|
-
const http = require('http');
|
|
8
|
-
|
|
9
|
-
class GuardianSitemap {
|
|
10
|
-
constructor(options = {}) {
|
|
11
|
-
this.timeout = options.timeout || 10000; // 10 seconds timeout
|
|
12
|
-
this.maxUrls = options.maxUrls || 200; // Maximum URLs to extract
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
/**
|
|
16
|
-
* Fetch content from URL
|
|
17
|
-
* @param {string} url - URL to fetch
|
|
18
|
-
* @returns {Promise<string|null>} Content or null if failed
|
|
19
|
-
*/
|
|
20
|
-
async fetch(url) {
|
|
21
|
-
return new Promise((resolve) => {
|
|
22
|
-
try {
|
|
23
|
-
const urlObj = new URL(url);
|
|
24
|
-
const client = urlObj.protocol === 'https:' ? https : http;
|
|
25
|
-
|
|
26
|
-
const request = client.get(url, { timeout: this.timeout }, (response) => {
|
|
27
|
-
// Follow redirects
|
|
28
|
-
if (response.statusCode >= 300 && response.statusCode < 400 && response.headers.location) {
|
|
29
|
-
return this.fetch(response.headers.location).then(resolve);
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
if (response.statusCode !== 200) {
|
|
33
|
-
return resolve(null);
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
let data = '';
|
|
37
|
-
response.on('data', (chunk) => { data += chunk; });
|
|
38
|
-
response.on('end', () => resolve(data));
|
|
39
|
-
});
|
|
40
|
-
|
|
41
|
-
request.on('error', () => resolve(null));
|
|
42
|
-
request.on('timeout', () => {
|
|
43
|
-
request.destroy();
|
|
44
|
-
resolve(null);
|
|
45
|
-
});
|
|
46
|
-
} catch (
|
|
47
|
-
resolve(null);
|
|
48
|
-
}
|
|
49
|
-
});
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
/**
|
|
53
|
-
* Discover sitemap URLs from robots.txt
|
|
54
|
-
* @param {string} baseUrl - Base URL of the website
|
|
55
|
-
* @returns {Promise<string[]>} Array of sitemap URLs
|
|
56
|
-
*/
|
|
57
|
-
async discoverFromRobots(baseUrl) {
|
|
58
|
-
try {
|
|
59
|
-
const robotsUrl = new URL('/robots.txt', baseUrl).href;
|
|
60
|
-
const content = await this.fetch(robotsUrl);
|
|
61
|
-
|
|
62
|
-
if (!content) {
|
|
63
|
-
return [];
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
const sitemaps = [];
|
|
67
|
-
const lines = content.split('\n');
|
|
68
|
-
|
|
69
|
-
for (const line of lines) {
|
|
70
|
-
const trimmed = line.trim();
|
|
71
|
-
if (trimmed.toLowerCase().startsWith('sitemap:')) {
|
|
72
|
-
const sitemapUrl = trimmed.substring(8).trim();
|
|
73
|
-
if (sitemapUrl) {
|
|
74
|
-
sitemaps.push(sitemapUrl);
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
return sitemaps;
|
|
80
|
-
} catch (error) {
|
|
81
|
-
console.error(`❌ Failed to fetch robots.txt: ${error.message}`);
|
|
82
|
-
return [];
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
/**
|
|
87
|
-
* Parse sitemap XML and extract URLs
|
|
88
|
-
* @param {string} xml - Sitemap XML content
|
|
89
|
-
* @returns {string[]} Array of URLs
|
|
90
|
-
*/
|
|
91
|
-
parseSitemap(xml) {
|
|
92
|
-
try {
|
|
93
|
-
const urls = [];
|
|
94
|
-
|
|
95
|
-
// Simple regex to extract <loc> tags (works for most sitemaps)
|
|
96
|
-
const locRegex = /<loc>(.*?)<\/loc>/gi;
|
|
97
|
-
let match;
|
|
98
|
-
|
|
99
|
-
while ((match = locRegex.exec(xml)) !== null && urls.length < this.maxUrls) {
|
|
100
|
-
const url = match[1].trim();
|
|
101
|
-
if (url) {
|
|
102
|
-
urls.push(url);
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
return urls;
|
|
107
|
-
} catch (error) {
|
|
108
|
-
console.error(`❌ Failed to parse sitemap: ${error.message}`);
|
|
109
|
-
return [];
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
/**
|
|
114
|
-
* Check if URL is a sitemap index (contains other sitemaps)
|
|
115
|
-
* @param {string} xml - XML content
|
|
116
|
-
* @returns {boolean} True if sitemap index
|
|
117
|
-
*/
|
|
118
|
-
isSitemapIndex(xml) {
|
|
119
|
-
return xml.includes('<sitemapindex') || xml.includes('</sitemapindex>');
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
/**
|
|
123
|
-
* Discover all URLs from base URL (robots.txt + sitemaps)
|
|
124
|
-
* @param {string} baseUrl - Base URL of the website
|
|
125
|
-
* @returns {Promise<object>} Object with discovered URLs and stats
|
|
126
|
-
*/
|
|
127
|
-
async discover(baseUrl) {
|
|
128
|
-
const result = {
|
|
129
|
-
urls: [],
|
|
130
|
-
sitemapsChecked: 0,
|
|
131
|
-
source: 'none',
|
|
132
|
-
};
|
|
133
|
-
|
|
134
|
-
try {
|
|
135
|
-
// Step 1: Check robots.txt for sitemap URLs
|
|
136
|
-
console.log('🗺️ Checking robots.txt for sitemaps...');
|
|
137
|
-
const sitemapUrls = await this.discoverFromRobots(baseUrl);
|
|
138
|
-
|
|
139
|
-
if (sitemapUrls.length === 0) {
|
|
140
|
-
// Try default sitemap.xml location
|
|
141
|
-
console.log('🗺️ Trying default sitemap.xml...');
|
|
142
|
-
sitemapUrls.push(new URL('/sitemap.xml', baseUrl).href);
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
// Step 2: Fetch and parse each sitemap
|
|
146
|
-
for (const sitemapUrl of sitemapUrls) {
|
|
147
|
-
if (result.urls.length >= this.maxUrls) {
|
|
148
|
-
break;
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
console.log(`🗺️ Fetching sitemap: ${sitemapUrl}`);
|
|
152
|
-
const xml = await this.fetch(sitemapUrl);
|
|
153
|
-
|
|
154
|
-
if (!xml) {
|
|
155
|
-
continue;
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
result.sitemapsChecked++;
|
|
159
|
-
|
|
160
|
-
// Check if it's a sitemap index
|
|
161
|
-
if (this.isSitemapIndex(xml)) {
|
|
162
|
-
const childSitemaps = this.parseSitemap(xml);
|
|
163
|
-
console.log(`🗺️ Found sitemap index with ${childSitemaps.length} child sitemaps`);
|
|
164
|
-
|
|
165
|
-
// Fetch child sitemaps
|
|
166
|
-
for (const childUrl of childSitemaps) {
|
|
167
|
-
if (result.urls.length >= this.maxUrls) {
|
|
168
|
-
break;
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
const childXml = await this.fetch(childUrl);
|
|
172
|
-
if (childXml) {
|
|
173
|
-
const childUrls = this.parseSitemap(childXml);
|
|
174
|
-
result.urls.push(...childUrls.slice(0, this.maxUrls - result.urls.length));
|
|
175
|
-
result.sitemapsChecked++;
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
} else {
|
|
179
|
-
// Regular sitemap
|
|
180
|
-
const urls = this.parseSitemap(xml);
|
|
181
|
-
result.urls.push(...urls.slice(0, this.maxUrls - result.urls.length));
|
|
182
|
-
}
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
// Deduplicate URLs
|
|
186
|
-
result.urls = [...new Set(result.urls)];
|
|
187
|
-
|
|
188
|
-
if (result.urls.length > 0) {
|
|
189
|
-
result.source = 'sitemap';
|
|
190
|
-
console.log(`✅ Discovered ${result.urls.length} URLs from ${result.sitemapsChecked} sitemap(s)`);
|
|
191
|
-
} else {
|
|
192
|
-
console.log('⚠️ No URLs found in sitemaps');
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
return result;
|
|
196
|
-
} catch (error) {
|
|
197
|
-
console.error(`❌ Sitemap discovery failed: ${error.message}`);
|
|
198
|
-
return result;
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
/**
|
|
203
|
-
* Filter URLs to same origin only
|
|
204
|
-
* @param {string[]} urls - Array of URLs
|
|
205
|
-
* @param {string} baseUrl - Base URL to compare against
|
|
206
|
-
* @returns {string[]} Filtered URLs
|
|
207
|
-
*/
|
|
208
|
-
filterSameOrigin(urls, baseUrl) {
|
|
209
|
-
try {
|
|
210
|
-
const baseOrigin = new URL(baseUrl).origin;
|
|
211
|
-
|
|
212
|
-
return urls.filter(url => {
|
|
213
|
-
try {
|
|
214
|
-
return new URL(url).origin === baseOrigin;
|
|
215
|
-
} catch {
|
|
216
|
-
return false;
|
|
217
|
-
}
|
|
218
|
-
});
|
|
219
|
-
} catch (
|
|
220
|
-
return [];
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
module.exports = GuardianSitemap;
|
|
1
|
+
/**
|
|
2
|
+
* Guardian Sitemap Discovery Module
|
|
3
|
+
* Discovers URLs from robots.txt and sitemap.xml
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
const https = require('https');
|
|
7
|
+
const http = require('http');
|
|
8
|
+
|
|
9
|
+
/**
 * Discovers page URLs for a site by reading robots.txt "Sitemap:" entries
 * (falling back to /sitemap.xml) and parsing the referenced sitemap files.
 */
class GuardianSitemap {
  /**
   * @param {object} [options]
   * @param {number} [options.timeout=10000] - Per-request socket timeout in milliseconds
   * @param {number} [options.maxUrls=200] - Maximum number of URLs to collect
   */
  constructor(options = {}) {
    this.timeout = options.timeout || 10000; // 10 seconds timeout
    this.maxUrls = options.maxUrls || 200; // Maximum URLs to extract
  }

  /**
   * Fetch the body of a URL over http/https.
   *
   * Never rejects: any failure (invalid URL, network error, timeout,
   * non-200 status, too many redirects) resolves to null.
   *
   * @param {string} url - Absolute URL to fetch
   * @param {number} [redirectsLeft=5] - Remaining redirects that may be followed
   * @returns {Promise<string|null>} Content or null if failed
   */
  async fetch(url, redirectsLeft = 5) {
    return new Promise((resolve) => {
      try {
        const urlObj = new URL(url);
        const client = urlObj.protocol === 'https:' ? https : http;

        const request = client.get(url, { timeout: this.timeout }, (response) => {
          const { statusCode, headers } = response;

          // Follow redirects
          if (statusCode >= 300 && statusCode < 400 && headers.location) {
            // Drain the unused body so the socket is released.
            response.resume();

            // Bail out instead of recursing forever on a redirect loop.
            if (redirectsLeft <= 0) {
              return resolve(null);
            }

            let target;
            try {
              // Location may be relative; resolve it against the current URL.
              target = new URL(headers.location, urlObj).href;
            } catch (_error) {
              return resolve(null);
            }

            return this.fetch(target, redirectsLeft - 1).then(resolve);
          }

          if (statusCode !== 200) {
            response.resume();
            return resolve(null);
          }

          // Decode as UTF-8 at the stream level so multi-byte characters
          // split across chunks are not corrupted by string concatenation.
          response.setEncoding('utf8');

          let data = '';
          response.on('data', (chunk) => { data += chunk; });
          response.on('end', () => resolve(data));
          response.on('error', () => resolve(null));
        });

        request.on('error', () => resolve(null));
        request.on('timeout', () => {
          request.destroy();
          resolve(null);
        });
      } catch (_error) {
        resolve(null);
      }
    });
  }

  /**
   * Discover sitemap URLs from robots.txt
   *
   * @param {string} baseUrl - Base URL of the website
   * @returns {Promise<string[]>} Array of sitemap URLs (empty when robots.txt
   *   is missing, unreadable, or lists no sitemaps)
   */
  async discoverFromRobots(baseUrl) {
    try {
      const robotsUrl = new URL('/robots.txt', baseUrl).href;
      const content = await this.fetch(robotsUrl);

      if (!content) {
        return [];
      }

      const sitemaps = [];
      const lines = content.split('\n');

      for (const line of lines) {
        // trim() also strips the trailing "\r" of CRLF files
        const trimmed = line.trim();
        if (trimmed.toLowerCase().startsWith('sitemap:')) {
          // 8 === 'sitemap:'.length
          const sitemapUrl = trimmed.substring(8).trim();
          if (sitemapUrl) {
            sitemaps.push(sitemapUrl);
          }
        }
      }

      return sitemaps;
    } catch (error) {
      console.error(`❌ Failed to fetch robots.txt: ${error.message}`);
      return [];
    }
  }

  /**
   * Parse sitemap XML and extract the contents of its <loc> elements.
   *
   * Handles values that span several lines, CDATA-wrapped values, and the
   * five predefined XML entities. At most `maxUrls` entries are returned.
   *
   * @param {string} xml - Sitemap XML content
   * @returns {string[]} Array of URLs
   */
  parseSitemap(xml) {
    try {
      const urls = [];

      // [\s\S] instead of "." so a <loc> whose value is on its own line matches.
      const locRegex = /<loc>([\s\S]*?)<\/loc>/gi;
      const cdataRegex = /^<!\[CDATA\[([\s\S]*?)\]\]>$/;
      let match;

      while ((match = locRegex.exec(xml)) !== null && urls.length < this.maxUrls) {
        const raw = match[1].trim();
        const cdata = cdataRegex.exec(raw);

        // CDATA content is literal; everything else is entity-escaped.
        const url = cdata
          ? cdata[1].trim()
          : raw
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&apos;/g, "'")
            .replace(/&amp;/g, '&'); // last, so "&amp;lt;" is not decoded twice

        if (url) {
          urls.push(url);
        }
      }

      return urls;
    } catch (error) {
      console.error(`❌ Failed to parse sitemap: ${error.message}`);
      return [];
    }
  }

  /**
   * Check if XML content is a sitemap index (contains other sitemaps)
   *
   * @param {string} xml - XML content
   * @returns {boolean} True if sitemap index
   */
  isSitemapIndex(xml) {
    return xml.includes('<sitemapindex') || xml.includes('</sitemapindex>');
  }

  /**
   * Fetch one sitemap and add its page URLs to `unique`, descending into
   * child sitemaps when the document is a sitemap index.
   *
   * @param {string} sitemapUrl - Sitemap (or sitemap index) URL
   * @param {Set<string>} unique - Accumulator of discovered page URLs
   * @param {Set<string>} visited - Sitemap URLs already requested
   * @param {object} result - Discovery result; `sitemapsChecked` is incremented
   * @param {number} depth - Current index nesting level (0 for top-level)
   * @returns {Promise<void>}
   */
  async _collectSitemap(sitemapUrl, unique, visited, result, depth) {
    // Guards against sitemaps that reference each other.
    if (visited.has(sitemapUrl)) {
      return;
    }
    visited.add(sitemapUrl);

    const xml = await this.fetch(sitemapUrl);
    if (!xml) {
      return;
    }

    result.sitemapsChecked++;

    if (!this.isSitemapIndex(xml)) {
      // Regular sitemap
      for (const url of this.parseSitemap(xml)) {
        if (unique.size >= this.maxUrls) {
          break;
        }
        unique.add(url);
      }
      return;
    }

    const childSitemaps = this.parseSitemap(xml);
    console.log(`🗺️ Found sitemap index with ${childSitemaps.length} child sitemaps`);

    // The protocol does not allow nested indexes; tolerate a little nesting
    // but never descend without bound.
    if (depth >= 2) {
      return;
    }

    // Fetch child sitemaps
    for (const childUrl of childSitemaps) {
      if (unique.size >= this.maxUrls) {
        break;
      }
      await this._collectSitemap(childUrl, unique, visited, result, depth + 1);
    }
  }

  /**
   * Discover all URLs from base URL (robots.txt + sitemaps)
   *
   * Never throws; on failure the partially filled result is returned.
   *
   * @param {string} baseUrl - Base URL of the website
   * @returns {Promise<object>} `{ urls, sitemapsChecked, source }` where
   *   `urls` holds at most `maxUrls` unique entries and `source` is
   *   'sitemap' when any URL was found, otherwise 'none'
   */
  async discover(baseUrl) {
    const result = {
      urls: [],
      sitemapsChecked: 0,
      source: 'none',
    };

    // De-duplicating while collecting means duplicates do not use up the cap.
    const unique = new Set();
    const visited = new Set();

    try {
      // Step 1: Check robots.txt for sitemap URLs
      console.log('🗺️ Checking robots.txt for sitemaps...');
      const sitemapUrls = await this.discoverFromRobots(baseUrl);

      if (sitemapUrls.length === 0) {
        // Try default sitemap.xml location
        console.log('🗺️ Trying default sitemap.xml...');
        sitemapUrls.push(new URL('/sitemap.xml', baseUrl).href);
      }

      // Step 2: Fetch and parse each sitemap
      for (const sitemapUrl of sitemapUrls) {
        if (unique.size >= this.maxUrls) {
          break;
        }

        console.log(`🗺️ Fetching sitemap: ${sitemapUrl}`);
        await this._collectSitemap(sitemapUrl, unique, visited, result, 0);
      }

      result.urls = [...unique];

      if (result.urls.length > 0) {
        result.source = 'sitemap';
        console.log(`✅ Discovered ${result.urls.length} URLs from ${result.sitemapsChecked} sitemap(s)`);
      } else {
        console.log('⚠️ No URLs found in sitemaps');
      }

      return result;
    } catch (error) {
      console.error(`❌ Sitemap discovery failed: ${error.message}`);
      // Keep whatever was collected before the failure.
      result.urls = [...unique];
      return result;
    }
  }

  /**
   * Filter URLs to same origin only
   *
   * @param {string[]} urls - Array of URLs
   * @param {string} baseUrl - Base URL to compare against
   * @returns {string[]} URLs whose origin equals that of `baseUrl`; entries
   *   that are not valid absolute URLs are dropped, and an invalid `baseUrl`
   *   yields an empty array
   */
  filterSameOrigin(urls, baseUrl) {
    try {
      const baseOrigin = new URL(baseUrl).origin;

      return urls.filter(url => {
        try {
          return new URL(url).origin === baseOrigin;
        } catch {
          return false;
        }
      });
    } catch (_error) {
      return [];
    }
  }
}
|
|
224
|
+
|
|
225
|
+
module.exports = GuardianSitemap;
|