n8n-nodes-seo-scanner 1.2.32 → 1.2.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/SeoScanner.node.js +321 -19
- package/dist/SeoScanner.node.js.map +1 -1
- package/dist/analyzeUtils.js +20 -3
- package/dist/analyzeUtils.js.map +1 -1
- package/dist/networkUtils.js +34 -2
- package/dist/networkUtils.js.map +1 -1
- package/dist/nodes/SeoScanner/SeoScanner.node.js +321 -19
- package/dist/nodes/SeoScanner/analyzeUtils.js +20 -3
- package/dist/nodes/SeoScanner/networkUtils.js +34 -2
- package/dist/nodes/SeoScanner/robotsUtils.js +100 -19
- package/dist/nodes/SeoScanner/securityHeadersUtils.js +20 -18
- package/dist/robotsUtils.js +100 -19
- package/dist/robotsUtils.js.map +1 -1
- package/dist/securityHeadersUtils.js +20 -18
- package/dist/securityHeadersUtils.js.map +1 -1
- package/package.json +1 -1
package/dist/SeoScanner.node.js
CHANGED
|
@@ -85,6 +85,97 @@ function readNullableNumber(value) {
|
|
|
85
85
|
const numberValue = Number(value);
|
|
86
86
|
return Number.isFinite(numberValue) ? numberValue : undefined;
|
|
87
87
|
}
|
|
88
|
+
/**
 * Coerces a loosely typed value into a flat list of non-empty, trimmed strings.
 *
 * - `undefined` / `null` produce `undefined`, so callers can tell "absent"
 *   apart from "present but empty".
 * - Arrays are walked recursively and every element goes through these rules.
 * - A string that starts with `[` is first tried as a JSON array; when that
 *   does not yield an array the string falls back to line splitting.
 * - Any other string is split on line breaks (`\n` or `\r\n`), each piece is
 *   trimmed and blank pieces are dropped.
 * - Every other type produces an empty list.
 *
 * @param {unknown} value Raw value to coerce.
 * @returns {string[] | undefined} Flat list of strings, or `undefined` when the value is nullish.
 */
function readStringArray(value) {
    if (value == null) {
        return undefined;
    }
    if (Array.isArray(value)) {
        const collected = [];
        for (const entry of value) {
            const nested = readStringArray(entry);
            if (!nested) {
                continue;
            }
            for (const text of nested) {
                collected.push(text);
            }
        }
        return collected;
    }
    if (typeof value !== 'string') {
        return [];
    }
    const text = value.trim();
    if (text === '') {
        return [];
    }
    if (text.startsWith('[')) {
        try {
            const decoded = JSON.parse(text);
            if (Array.isArray(decoded)) {
                return readStringArray(decoded) || [];
            }
        }
        catch {
            // Not valid JSON: treat the text as a plain line-separated list below.
        }
    }
    const lines = [];
    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line) {
            lines.push(line);
        }
    }
    return lines;
}
|
|
110
|
+
/**
 * Removes case-insensitive duplicates from an iterable of strings.
 *
 * The first spelling of each string is kept and the original order is preserved.
 *
 * @param {Iterable<string>} values Strings to de-duplicate.
 * @returns {string[]} New array holding the first occurrence of each string.
 */
function uniqueStrings(values) {
    const seenKeys = new Set();
    return [...values].filter((entry) => {
        const lowered = entry.toLowerCase();
        if (seenKeys.has(lowered)) {
            return false;
        }
        seenKeys.add(lowered);
        return true;
    });
}
|
|
122
|
+
/**
 * Lower-cases a hostname and drops one leading `www.` label, if present.
 *
 * @param {string} hostname Hostname to normalize.
 * @returns {string} Lower-cased hostname without a leading `www.`.
 */
function stripWww(hostname) {
    const lowered = hostname.toLowerCase();
    const prefix = 'www.';
    return lowered.startsWith(prefix) ? lowered.slice(prefix.length) : lowered;
}
|
|
125
|
+
/**
 * Tells whether two absolute URLs point at the same host, ignoring case and
 * a leading `www.`.
 *
 * Only the hostnames are compared; scheme, port and path play no part.
 * If either value cannot be parsed as a URL the result is `false`.
 *
 * @param {string} url1 First absolute URL.
 * @param {string} url2 Second absolute URL.
 * @returns {boolean} `true` when both hostnames match after normalization.
 */
function sameCrawlDomain(url1, url2) {
    let hostsMatch = false;
    try {
        const firstHost = stripWww(new URL(url1).hostname);
        const secondHost = stripWww(new URL(url2).hostname);
        hostsMatch = firstHost === secondHost;
    }
    catch {
        hostsMatch = false;
    }
    return hostsMatch;
}
|
|
133
|
+
/**
 * Normalizes an absolute http(s) URL into a canonical string.
 *
 * The fragment is removed, the query parameters are sorted, and one trailing
 * `/` is stripped from the serialized URL. Anything that does not start with
 * `http://` or `https://`, or that the URL parser rejects, yields `''`.
 *
 * @param {string} value Candidate absolute URL.
 * @returns {string} Normalized URL, or an empty string when the value is not a usable http(s) URL.
 */
function normalizeCrawlAbsoluteUrl(value) {
    const hasHttpPrefix = /^https?:\/\//i.test(value);
    if (!hasHttpPrefix) {
        return '';
    }
    try {
        const parsed = new URL(value);
        const isWebProtocol = parsed.protocol === 'http:' || parsed.protocol === 'https:';
        if (!isWebProtocol) {
            return '';
        }
        parsed.hash = '';
        parsed.searchParams.sort();
        const serialized = parsed.href;
        return serialized.endsWith('/') ? serialized.slice(0, -1) : serialized;
    }
    catch {
        return '';
    }
}
|
|
148
|
+
/**
 * Case-insensitive substring test of a pattern against a URL.
 *
 * The pattern is looked for in the raw URL text and, when the URL parses, in
 * the parser-normalized `pathname + search` as well (this catches patterns
 * that only appear after path normalization, e.g. `/x/./y` matching `/x/y`).
 *
 * An empty pattern never matches. Without this guard `includes('')` is always
 * true, so a single blank rule would match every URL.
 *
 * @param {string} url URL (or URL-like text) to inspect.
 * @param {string} pattern Fragment to look for.
 * @returns {boolean} `true` when the non-empty pattern occurs in the URL.
 */
function urlContainsPattern(url, pattern) {
    const haystack = url.toLowerCase();
    const needle = pattern.toLowerCase();
    if (needle === '') {
        return false;
    }
    if (haystack.includes(needle)) {
        return true;
    }
    try {
        const parsed = new URL(url);
        return `${parsed.pathname}${parsed.search}`.toLowerCase().includes(needle);
    }
    catch {
        // Not a parseable URL: the raw-text check above is the only signal.
        return false;
    }
}
|
|
160
|
+
/**
 * Normalizes a plan's internal-page limit.
 *
 * - `null` is passed through unchanged.
 * - Anything that is not a finite number yields `undefined`.
 * - Finite numbers are rounded to the nearest integer and clamped to at least 1.
 *
 * @param {unknown} value Raw limit value.
 * @returns {number | null | undefined} Normalized limit.
 */
function normalizePlanInternalLimit(value) {
    if (value === null) {
        return null;
    }
    const isUsableNumber = typeof value === 'number' && Number.isFinite(value);
    if (!isUsableNumber) {
        return undefined;
    }
    const rounded = Math.round(value);
    return rounded < 1 ? 1 : rounded;
}
|
|
167
|
+
/**
 * Builds the (Spanish) error message shown when the requested number of
 * internal pages exceeds the plan's limit.
 *
 * A dedicated message is used when the plan name is "max" (case-insensitive)
 * and the limit is exactly 100; every other combination gets the generic
 * message with the limit interpolated.
 *
 * @param {number} limit Plan limit to report.
 * @param {string} [planName] Name of the plan, if known.
 * @returns {string} Error message.
 */
function planInternalLimitError(limit, planName) {
    const isMaxPlanAtCap = planName?.toLowerCase() === 'max' && limit === 100;
    if (!isMaxPlanAtCap) {
        return `Tu plan permite escanear como máximo ${limit} páginas internas por escaneo.`;
    }
    return 'Tu plan Max permite escanear como máximo 100 páginas internas por escaneo. Para escanear más páginas, puedes usar el escaneo por API o contactar con soporte.';
}
|
|
173
|
+
/**
 * Builds the (Spanish) error message shown when the required pages do not fit
 * in what is left of the plan's internal-page limit.
 *
 * @param {number} availableRequiredSlots Slots still free for required pages; zero or negative means none.
 * @returns {string} Error message.
 */
function requiredSlotsLimitError(availableRequiredSlots) {
    const hasFreeSlots = availableRequiredSlots > 0;
    // NOTE(review): with exactly one free slot the text reads "1 páginas obligatorias";
    // left untouched because the message is passed to a localizer elsewhere.
    return hasFreeSlots
        ? `Con la configuración actual solo puedes añadir ${availableRequiredSlots} páginas obligatorias. Baja el número de páginas internas a escanear o mejora tu plan.`
        : 'Ya has usado todo el límite de páginas internas de tu plan. Baja el número de páginas internas a escanear para poder añadir páginas obligatorias.';
}
|
|
88
179
|
function readHeader(headers, name) {
|
|
89
180
|
const direct = readString(headers[name]);
|
|
90
181
|
if (direct)
|
|
@@ -228,8 +319,17 @@ function getIncomingWebhookPayload(input) {
|
|
|
228
319
|
readBearerToken(authorization),
|
|
229
320
|
apiToken: readString(source.apiToken) ||
|
|
230
321
|
readHeader(headers, 'x-api-token'),
|
|
231
|
-
|
|
232
|
-
|
|
322
|
+
scanFullSite: readBoolean(source.scanFullSite),
|
|
323
|
+
scanInternalLinks: readBoolean(source.scanFullSite) ?? readBoolean(source.allsite) ?? readBoolean(source.scanInternalLinks),
|
|
324
|
+
maxUrls: readNumber(source.maxUrls),
|
|
325
|
+
maxInternalUrls: readNumber(source.maxInternalUrls) ?? readNumber(source.maxUrls),
|
|
326
|
+
requiredUrls: readStringArray(source.requiredUrls),
|
|
327
|
+
excludedPatterns: readStringArray(source.excludedPatterns),
|
|
328
|
+
ignoredUrls: readStringArray(source.ignoredUrls),
|
|
329
|
+
planName: readString(source.planName) || readString(source.webPlan) || readString(source.apiPlan),
|
|
330
|
+
planInternalPageLimit: readNullableNumber(source.planInternalPageLimit) ??
|
|
331
|
+
readNullableNumber(source.webInternalPageLimit) ??
|
|
332
|
+
readNullableNumber(source.maxWebScanUrls),
|
|
233
333
|
scanLimitSource: readString(source.scanLimitSource) === 'api' ? 'api' : undefined,
|
|
234
334
|
apiPlan: readString(source.apiPlan),
|
|
235
335
|
apiInternalPageLimit: readNullableNumber(source.apiInternalPageLimit),
|
|
@@ -287,6 +387,24 @@ class SeoScanner {
|
|
|
287
387
|
description: 'Número máximo de páginas internas a analizar. El rastreo sigue enlaces descubiertos en cada página y usa el sitemap como semilla si existe.',
|
|
288
388
|
displayOptions: { show: { scanInternalLinks: [true] } },
|
|
289
389
|
},
|
|
390
|
+
{
|
|
391
|
+
displayName: 'Páginas Obligatorias a Escanear',
|
|
392
|
+
name: 'requiredUrls',
|
|
393
|
+
type: 'string',
|
|
394
|
+
typeOptions: { alwaysOpenEditWindow: true },
|
|
395
|
+
default: '',
|
|
396
|
+
description: 'URLs completas del mismo dominio que se analizarán aunque no aparezcan en enlaces internos o sitemap. Una por línea.',
|
|
397
|
+
displayOptions: { show: { scanInternalLinks: [true] } },
|
|
398
|
+
},
|
|
399
|
+
{
|
|
400
|
+
displayName: 'Excluir URLs Que Contengan',
|
|
401
|
+
name: 'excludedPatterns',
|
|
402
|
+
type: 'string',
|
|
403
|
+
typeOptions: { alwaysOpenEditWindow: true },
|
|
404
|
+
default: '',
|
|
405
|
+
description: 'Fragmentos de URL que no se analizarán ni aparecerán como errores. Uno por línea.',
|
|
406
|
+
displayOptions: { show: { scanInternalLinks: [true] } },
|
|
407
|
+
},
|
|
290
408
|
{
|
|
291
409
|
displayName: 'Ignorar Fallos (Uno por línea)',
|
|
292
410
|
name: 'ignoredIssues',
|
|
@@ -578,6 +696,11 @@ class SeoScanner {
|
|
|
578
696
|
apiToken,
|
|
579
697
|
});
|
|
580
698
|
maxInternalUrls = apiScanLimit.effectiveInternalPageLimit;
|
|
699
|
+
const incomingPlanInternalLimit = normalizePlanInternalLimit(incoming.planInternalPageLimit);
|
|
700
|
+
const effectivePlanInternalLimit = apiScanLimit.apiInternalPageLimit !== undefined
|
|
701
|
+
? apiScanLimit.apiInternalPageLimit
|
|
702
|
+
: incomingPlanInternalLimit;
|
|
703
|
+
const effectivePlanName = incoming.planName || apiScanLimit.apiPlan || '';
|
|
581
704
|
const timeoutSeconds = Math.max(5, Math.min(60, this.getNodeParameter('timeoutSeconds', 0) ?? 15));
|
|
582
705
|
const timeoutMs = timeoutSeconds * 1000;
|
|
583
706
|
const followRedirects = this.getNodeParameter('followRedirects', 0) !== false;
|
|
@@ -604,7 +727,9 @@ class SeoScanner {
|
|
|
604
727
|
const generateHtmlReport = configuredHtmlReport || hasIncomingCallback;
|
|
605
728
|
const detailOpts = this.getNodeParameter('detailOptions', 0) || {};
|
|
606
729
|
const ignoredIssues = (this.getNodeParameter('ignoredIssues', 0, '') || '').split('\n').map(s => s.trim()).filter(Boolean);
|
|
607
|
-
const
|
|
730
|
+
const configuredRequiredUrls = (this.getNodeParameter('requiredUrls', 0, '') || '').split('\n').map(s => s.trim()).filter(Boolean);
|
|
731
|
+
const configuredExcludedPatterns = (this.getNodeParameter('excludedPatterns', 0, '') || '').split('\n').map(s => s.trim()).filter(Boolean);
|
|
732
|
+
const configuredIgnoredPages = (this.getNodeParameter('ignoredPages', 0, '') || '').split('\n').map(s => s.trim()).filter(Boolean);
|
|
608
733
|
const analyzeOpts = {
|
|
609
734
|
includeImageDetails: detailOpts.includeImageDetails !== false,
|
|
610
735
|
includeLinkDetails: detailOpts.includeLinkDetails !== false,
|
|
@@ -630,6 +755,91 @@ class SeoScanner {
|
|
|
630
755
|
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
631
756
|
return [[buildErrorOutput({ message: localize(errorMessage), url, incoming, apiToken })]];
|
|
632
757
|
}
|
|
758
|
+
let requiredUrls = [];
|
|
759
|
+
let excludedPatterns = [];
|
|
760
|
+
let ignoredUrls = [];
|
|
761
|
+
if (scanInternalLinks) {
|
|
762
|
+
// Validates and normalizes a list of user-supplied URLs.
// Every entry must normalize to an absolute http(s) URL, belong to the same
// crawl domain as `baseUrl`, and be unique (case-insensitively) after
// normalization. The first violation aborts with `{ ok: false, error }`, where
// the error text is prefixed with `label`; otherwise the result is
// `{ ok: true, urls }` with the normalized URLs in input order.
const normalizeUrlList = (values, label) => {
    const accepted = [];
    const seenKeys = new Set();
    const reject = (error) => ({ ok: false, error });
    for (const rawUrl of values) {
        const cleaned = normalizeCrawlAbsoluteUrl(rawUrl);
        if (!cleaned) {
            return reject(`${label} debe incluir URLs completas que empiecen por http:// o https://.`);
        }
        if (!sameCrawlDomain(cleaned, baseUrl)) {
            return reject(`${label} solo puede incluir URLs del mismo dominio que se está escaneando.`);
        }
        const dedupeKey = cleaned.toLowerCase();
        if (seenKeys.has(dedupeKey)) {
            return reject(`${label} no puede incluir URLs duplicadas.`);
        }
        seenKeys.add(dedupeKey);
        accepted.push(cleaned);
    }
    return { ok: true, urls: accepted };
};
|
|
779
|
+
const requiredResult = normalizeUrlList(uniqueStrings([...configuredRequiredUrls, ...(incoming.requiredUrls || [])]), 'Páginas obligatorias a escanear');
|
|
780
|
+
if (!requiredResult.ok) {
|
|
781
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(requiredResult.error));
|
|
782
|
+
return [[buildErrorOutput({ message: localize(requiredResult.error), url: baseUrl, incoming, apiToken })]];
|
|
783
|
+
}
|
|
784
|
+
const ignoredResult = normalizeUrlList(uniqueStrings(incoming.ignoredUrls || []), 'URLs ignoradas');
|
|
785
|
+
if (!ignoredResult.ok) {
|
|
786
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(ignoredResult.error));
|
|
787
|
+
return [[buildErrorOutput({ message: localize(ignoredResult.error), url: baseUrl, incoming, apiToken })]];
|
|
788
|
+
}
|
|
789
|
+
requiredUrls = requiredResult.urls;
|
|
790
|
+
ignoredUrls = ignoredResult.urls;
|
|
791
|
+
if (requiredUrls.some((requiredUrl) => (0, urlUtils_1.normalizeUrlForDedupe)(requiredUrl).toLowerCase() === (0, urlUtils_1.normalizeUrlForDedupe)(baseUrl).toLowerCase())) {
|
|
792
|
+
const errorMessage = 'La URL principal ya se escanea aparte. No la añadas como página obligatoria.';
|
|
793
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
794
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
795
|
+
}
|
|
796
|
+
const rawPatterns = uniqueStrings([
|
|
797
|
+
...configuredIgnoredPages,
|
|
798
|
+
...configuredExcludedPatterns,
|
|
799
|
+
...(incoming.excludedPatterns || []),
|
|
800
|
+
]);
|
|
801
|
+
for (const pattern of rawPatterns) {
|
|
802
|
+
const trimmed = pattern.trim();
|
|
803
|
+
if (!trimmed) {
|
|
804
|
+
const errorMessage = 'Las reglas de exclusión no pueden estar vacías.';
|
|
805
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
806
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
807
|
+
}
|
|
808
|
+
if (trimmed === '/') {
|
|
809
|
+
const errorMessage = 'No puedes excluir solo "/", porque bloquearía todo el sitio.';
|
|
810
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
811
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
812
|
+
}
|
|
813
|
+
if (urlContainsPattern(baseUrl, trimmed)) {
|
|
814
|
+
const errorMessage = 'Una regla de exclusión no puede bloquear directamente la URL principal.';
|
|
815
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
816
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
817
|
+
}
|
|
818
|
+
excludedPatterns.push(trimmed);
|
|
819
|
+
}
|
|
820
|
+
if (typeof effectivePlanInternalLimit === 'number' && maxInternalUrls > effectivePlanInternalLimit) {
|
|
821
|
+
const errorMessage = planInternalLimitError(effectivePlanInternalLimit, effectivePlanName);
|
|
822
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
823
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
824
|
+
}
|
|
825
|
+
if (typeof effectivePlanInternalLimit === 'number' && maxInternalUrls + requiredUrls.length > effectivePlanInternalLimit) {
|
|
826
|
+
const availableRequiredSlots = effectivePlanInternalLimit - maxInternalUrls;
|
|
827
|
+
const errorMessage = requiredSlotsLimitError(availableRequiredSlots);
|
|
828
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
829
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
830
|
+
}
|
|
831
|
+
const ignoredUrlSet = new Set(ignoredUrls.map((ignoredUrl) => ignoredUrl.toLowerCase()));
|
|
832
|
+
if (requiredUrls.some((requiredUrl) => ignoredUrlSet.has(requiredUrl.toLowerCase()))) {
|
|
833
|
+
const errorMessage = 'Una página obligatoria también está marcada como ignorada.';
|
|
834
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
835
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
836
|
+
}
|
|
837
|
+
if (requiredUrls.some((requiredUrl) => excludedPatterns.some((pattern) => urlContainsPattern(requiredUrl, pattern)))) {
|
|
838
|
+
const errorMessage = 'Una página obligatoria coincide con una regla de exclusión.';
|
|
839
|
+
await notifyIncomingCallbackError(incoming, apiToken, localize(errorMessage));
|
|
840
|
+
return [[buildErrorOutput({ message: localize(errorMessage), url: baseUrl, incoming, apiToken })]];
|
|
841
|
+
}
|
|
842
|
+
}
|
|
633
843
|
if (!apiToken) {
|
|
634
844
|
try {
|
|
635
845
|
validationInfo = await validateApiKeyWithApp({
|
|
@@ -881,42 +1091,96 @@ class SeoScanner {
|
|
|
881
1091
|
let internalResults = [];
|
|
882
1092
|
const internalHtmlByFinalUrl = new Map();
|
|
883
1093
|
let crawlLimitReached = false;
|
|
1094
|
+
const exactIgnoredUrls = new Set(ignoredUrls.map((ignoredUrl) => (0, urlUtils_1.normalizeUrlForDedupe)(ignoredUrl).toLowerCase()));
|
|
1095
|
+
const ignoredByRuleUrls = new Map();
|
|
1096
|
+
// Records a URL that was skipped by the crawl rules. Entries are keyed by the
// lower-cased dedupe form of the URL, and the first spelling seen for a key is
// the one that is kept.
const trackIgnoredByRule = (candidate) => {
    const dedupeKey = (0, urlUtils_1.normalizeUrlForDedupe)(candidate).toLowerCase();
    if (ignoredByRuleUrls.has(dedupeKey)) {
        return;
    }
    ignoredByRuleUrls.set(dedupeKey, candidate);
};
|
|
1101
|
+
// Decides whether a URL must be left out of the crawl: either its lower-cased
// dedupe form is in the exact ignore set, or (when exclusion patterns exist)
// `isIgnoredPageUrl` reports a match against those patterns.
const isSkippedByCrawlRules = (candidate) => {
    const dedupeKey = (0, urlUtils_1.normalizeUrlForDedupe)(candidate).toLowerCase();
    const isExactlyIgnored = exactIgnoredUrls.has(dedupeKey);
    return isExactlyIgnored || (excludedPatterns.length > 0 && isIgnoredPageUrl(candidate, excludedPatterns));
};
|
|
884
1107
|
if (scanInternalLinks) {
|
|
885
1108
|
const crawlQueue = [];
|
|
886
1109
|
const queuedUrls = new Set();
|
|
1110
|
+
const requiredUrlSet = new Set(requiredUrls.map((requiredUrl) => (0, urlUtils_1.normalizeUrlForDedupe)(requiredUrl).toLowerCase()));
|
|
887
1111
|
const scannedUrls = new Set([(0, urlUtils_1.normalizeUrlForDedupe)(mainResult.finalUrl), (0, urlUtils_1.normalizeUrlForDedupe)(baseUrl)]);
|
|
888
|
-
|
|
1112
|
+
let discoveredCrawlAttempts = 0;
|
|
1113
|
+
let queuedDiscoveredUrls = 0;
|
|
1114
|
+
const addToCrawlQueue = (candidates, source) => {
|
|
889
1115
|
for (const candidate of candidates) {
|
|
890
|
-
if (
|
|
1116
|
+
if (source === 'discovered' && discoveredCrawlAttempts + queuedDiscoveredUrls >= maxInternalUrls) {
|
|
891
1117
|
crawlLimitReached = true;
|
|
892
1118
|
break;
|
|
893
1119
|
}
|
|
894
1120
|
if (!candidate || !isLikelyHtmlPageUrl(candidate))
|
|
895
1121
|
continue;
|
|
896
|
-
if (!(
|
|
1122
|
+
if (!sameCrawlDomain(candidate, mainResult.finalUrl))
|
|
897
1123
|
continue;
|
|
898
|
-
if (
|
|
1124
|
+
if (isSkippedByCrawlRules(candidate)) {
|
|
1125
|
+
trackIgnoredByRule(candidate);
|
|
899
1126
|
continue;
|
|
1127
|
+
}
|
|
900
1128
|
const norm = (0, urlUtils_1.normalizeUrlForDedupe)(candidate);
|
|
901
1129
|
if (scannedUrls.has(norm) || queuedUrls.has(norm))
|
|
902
1130
|
continue;
|
|
903
1131
|
queuedUrls.add(norm);
|
|
904
|
-
|
|
1132
|
+
if (source === 'discovered')
|
|
1133
|
+
queuedDiscoveredUrls++;
|
|
1134
|
+
crawlQueue.push({ url: candidate, source });
|
|
905
1135
|
}
|
|
906
1136
|
};
|
|
907
|
-
addToCrawlQueue(
|
|
908
|
-
addToCrawlQueue(
|
|
909
|
-
|
|
910
|
-
|
|
1137
|
+
addToCrawlQueue(requiredUrls, 'required');
|
|
1138
|
+
addToCrawlQueue(mainResult.linksInternalUrls, 'discovered');
|
|
1139
|
+
addToCrawlQueue(sitemapPageUrls, 'discovered');
|
|
1140
|
+
while (crawlQueue.length > 0) {
|
|
1141
|
+
const queueItem = crawlQueue.shift();
|
|
1142
|
+
const link = queueItem.url;
|
|
1143
|
+
const isRequiredCrawlUrl = queueItem.source === 'required' || requiredUrlSet.has((0, urlUtils_1.normalizeUrlForDedupe)(link).toLowerCase());
|
|
1144
|
+
if (queueItem.source === 'discovered') {
|
|
1145
|
+
queuedDiscoveredUrls = Math.max(0, queuedDiscoveredUrls - 1);
|
|
1146
|
+
if (discoveredCrawlAttempts >= maxInternalUrls) {
|
|
1147
|
+
crawlLimitReached = true;
|
|
1148
|
+
break;
|
|
1149
|
+
}
|
|
1150
|
+
}
|
|
911
1151
|
const requestedNorm = (0, urlUtils_1.normalizeUrlForDedupe)(link);
|
|
912
1152
|
queuedUrls.delete(requestedNorm);
|
|
913
1153
|
if (scannedUrls.has(requestedNorm))
|
|
914
1154
|
continue;
|
|
1155
|
+
if (isSkippedByCrawlRules(link)) {
|
|
1156
|
+
trackIgnoredByRule(link);
|
|
1157
|
+
continue;
|
|
1158
|
+
}
|
|
915
1159
|
scannedUrls.add(requestedNorm);
|
|
916
1160
|
try {
|
|
917
1161
|
const { html: innerHtml, finalUrl: innerFinal, statusCode: innerStatus, responseTimeMs: innerTime, timeToFirstByteMs: innerTtfb } = await (0, networkUtils_1.fetchPage)(link, timeoutMs, fetchOpts);
|
|
918
|
-
if (!(
|
|
919
|
-
|
|
1162
|
+
if (!sameCrawlDomain(innerFinal, mainResult.finalUrl)) {
|
|
1163
|
+
const result = (0, analyzeUtils_1.createEmptySeoResult)(link, isRequiredCrawlUrl
|
|
1164
|
+
? 'Página obligatoria configurada por el usuario: la URL redirige fuera del dominio escaneado'
|
|
1165
|
+
: 'La URL redirige fuera del dominio escaneado');
|
|
1166
|
+
if (queueItem.source === 'discovered')
|
|
1167
|
+
discoveredCrawlAttempts++;
|
|
1168
|
+
result.requiredUrl = isRequiredCrawlUrl;
|
|
1169
|
+
internalResults.push(result);
|
|
1170
|
+
continue;
|
|
1171
|
+
}
|
|
1172
|
+
if (innerStatus >= 400) {
|
|
1173
|
+
const result = (0, analyzeUtils_1.createEmptySeoResult)(link, isRequiredCrawlUrl
|
|
1174
|
+
? `Página obligatoria configurada por el usuario devolvió código ${innerStatus}`
|
|
1175
|
+
: `La página devolvió código ${innerStatus}`);
|
|
1176
|
+
result.finalUrl = innerFinal;
|
|
1177
|
+
result.statusCode = innerStatus;
|
|
1178
|
+
result.responseTimeMs = innerTime;
|
|
1179
|
+
result.timeToFirstByteMs = innerTtfb;
|
|
1180
|
+
if (queueItem.source === 'discovered')
|
|
1181
|
+
discoveredCrawlAttempts++;
|
|
1182
|
+
result.requiredUrl = isRequiredCrawlUrl;
|
|
1183
|
+
internalResults.push(result);
|
|
920
1184
|
continue;
|
|
921
1185
|
}
|
|
922
1186
|
scannedUrls.add((0, urlUtils_1.normalizeUrlForDedupe)(innerFinal));
|
|
@@ -926,14 +1190,23 @@ class SeoScanner {
|
|
|
926
1190
|
result.statusCode = innerStatus;
|
|
927
1191
|
result.responseTimeMs = innerTime;
|
|
928
1192
|
internalHtmlByFinalUrl.set((0, urlUtils_1.normalizeUrlForDedupe)(innerFinal), innerHtml);
|
|
1193
|
+
result.requiredUrl = isRequiredCrawlUrl;
|
|
929
1194
|
internalResults.push(result);
|
|
930
|
-
|
|
1195
|
+
if (queueItem.source === 'discovered')
|
|
1196
|
+
discoveredCrawlAttempts++;
|
|
1197
|
+
addToCrawlQueue(result.linksInternalUrls || [], 'discovered');
|
|
931
1198
|
}
|
|
932
1199
|
catch {
|
|
933
|
-
|
|
1200
|
+
const result = (0, analyzeUtils_1.createEmptySeoResult)(link, isRequiredCrawlUrl
|
|
1201
|
+
? 'Página obligatoria configurada por el usuario: error al cargar la página (timeout o red)'
|
|
1202
|
+
: 'Error al cargar la página (timeout o red)');
|
|
1203
|
+
if (queueItem.source === 'discovered')
|
|
1204
|
+
discoveredCrawlAttempts++;
|
|
1205
|
+
result.requiredUrl = isRequiredCrawlUrl;
|
|
1206
|
+
internalResults.push(result);
|
|
934
1207
|
}
|
|
935
1208
|
}
|
|
936
|
-
if (crawlQueue.
|
|
1209
|
+
if (crawlQueue.some((item) => item.source === 'discovered') && discoveredCrawlAttempts >= maxInternalUrls) {
|
|
937
1210
|
crawlLimitReached = true;
|
|
938
1211
|
}
|
|
939
1212
|
output.internalPages = internalResults;
|
|
@@ -941,9 +1214,16 @@ class SeoScanner {
|
|
|
941
1214
|
output.message = `Escaneadas ${output.scannedUrls} páginas (1 principal + ${internalResults.length} URLs internas descubiertas).`;
|
|
942
1215
|
output.siteCrawlDiscovery = {
|
|
943
1216
|
limit: maxInternalUrls,
|
|
1217
|
+
requiredUrls: requiredUrls.length,
|
|
1218
|
+
totalInternalLimit: maxInternalUrls + requiredUrls.length,
|
|
1219
|
+
planInternalLimit: effectivePlanInternalLimit === undefined ? null : effectivePlanInternalLimit,
|
|
1220
|
+
discoveredCrawlLimit: maxInternalUrls,
|
|
1221
|
+
discoveredCrawlScanned: discoveredCrawlAttempts,
|
|
1222
|
+
excludedPatterns: excludedPatterns.length,
|
|
944
1223
|
sitemapSeedUrls: sitemapPageUrls.length,
|
|
945
1224
|
totalDiscoveredInternalUrls: scannedUrls.size + queuedUrls.size,
|
|
946
1225
|
pendingUrlsNotScanned: crawlQueue.length,
|
|
1226
|
+
ignoredUrlsCount: ignoredByRuleUrls.size,
|
|
947
1227
|
};
|
|
948
1228
|
if (analyzeOpts.checkBrokenLinks) {
|
|
949
1229
|
const maxSiteCheck = Math.min(100, Math.max(analyzeOpts.maxBrokenLinksToCheck ?? 15, 30));
|
|
@@ -952,6 +1232,8 @@ class SeoScanner {
|
|
|
952
1232
|
const toCheckSite = [];
|
|
953
1233
|
const addInternal = (url, anchor) => {
|
|
954
1234
|
const norm = (0, urlUtils_1.normalizeUrlForDedupe)(url);
|
|
1235
|
+
if (isSkippedByCrawlRules(url))
|
|
1236
|
+
return;
|
|
955
1237
|
if (!seenInternal.has(norm) && isHttp(url)) {
|
|
956
1238
|
seenInternal.add(norm);
|
|
957
1239
|
if (toCheckSite.length < maxSiteCheck)
|
|
@@ -960,6 +1242,8 @@ class SeoScanner {
|
|
|
960
1242
|
};
|
|
961
1243
|
const addExternal = (url, anchor) => {
|
|
962
1244
|
const norm = url.toLowerCase();
|
|
1245
|
+
if (isSkippedByCrawlRules(url))
|
|
1246
|
+
return;
|
|
963
1247
|
if (!seenExternal.has(norm) && isHttp(url)) {
|
|
964
1248
|
seenExternal.add(norm);
|
|
965
1249
|
if (toCheckSite.length < maxSiteCheck)
|
|
@@ -1041,14 +1325,24 @@ class SeoScanner {
|
|
|
1041
1325
|
}
|
|
1042
1326
|
if (sitemapPageUrls.length > 0) {
|
|
1043
1327
|
const allDiscoveredLinks = new Set();
|
|
1044
|
-
mainResult.linksInternalUrls.forEach(u =>
|
|
1328
|
+
mainResult.linksInternalUrls.forEach(u => {
|
|
1329
|
+
if (!isSkippedByCrawlRules(u))
|
|
1330
|
+
allDiscoveredLinks.add((0, urlUtils_1.normalizeUrlForDedupe)(u));
|
|
1331
|
+
});
|
|
1045
1332
|
if (internalResults.length > 0) {
|
|
1046
1333
|
internalResults.forEach(r => {
|
|
1047
|
-
(r.linksInternalUrls || []).forEach(u =>
|
|
1334
|
+
(r.linksInternalUrls || []).forEach(u => {
|
|
1335
|
+
if (!isSkippedByCrawlRules(u))
|
|
1336
|
+
allDiscoveredLinks.add((0, urlUtils_1.normalizeUrlForDedupe)(u));
|
|
1337
|
+
});
|
|
1048
1338
|
});
|
|
1049
1339
|
}
|
|
1050
1340
|
const orphanPages = [];
|
|
1051
1341
|
for (const sitemapUrl of sitemapPageUrls) {
|
|
1342
|
+
if (isSkippedByCrawlRules(sitemapUrl)) {
|
|
1343
|
+
trackIgnoredByRule(sitemapUrl);
|
|
1344
|
+
continue;
|
|
1345
|
+
}
|
|
1052
1346
|
const norm = (0, urlUtils_1.normalizeUrlForDedupe)(sitemapUrl);
|
|
1053
1347
|
if (norm !== (0, urlUtils_1.normalizeUrlForDedupe)(mainResult.finalUrl) &&
|
|
1054
1348
|
!internalResults.some(r => (0, urlUtils_1.normalizeUrlForDedupe)(r.finalUrl) === norm) &&
|
|
@@ -1076,6 +1370,14 @@ class SeoScanner {
|
|
|
1076
1370
|
output.summary.orphanPagesCount = 0;
|
|
1077
1371
|
}
|
|
1078
1372
|
}
|
|
1373
|
+
if (ignoredByRuleUrls.size > 0) {
|
|
1374
|
+
output.ignoredUrls = [...ignoredByRuleUrls.values()];
|
|
1375
|
+
output.ignoredUrlsCount = ignoredByRuleUrls.size;
|
|
1376
|
+
output.ignoredUrlsSummary = `${ignoredByRuleUrls.size} URLs ignoradas por reglas de exclusión.`;
|
|
1377
|
+
if (output.siteCrawlDiscovery && typeof output.siteCrawlDiscovery === 'object') {
|
|
1378
|
+
output.siteCrawlDiscovery.ignoredUrlsCount = ignoredByRuleUrls.size;
|
|
1379
|
+
}
|
|
1380
|
+
}
|
|
1079
1381
|
if (apiScanLimit.apiLimitSource) {
|
|
1080
1382
|
const apiCreditsRemaining = (0, apiLimitUtils_1.resolveApiCreditsRemaining)({ apiToken, incoming, validationInfo });
|
|
1081
1383
|
output.apiUsage = (0, apiLimitUtils_1.buildApiUsagePayload)(apiScanLimit, apiCreditsRemaining);
|