@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/analysis/seo.js
CHANGED
|
@@ -2,9 +2,10 @@ import { load } from 'cheerio';
|
|
|
2
2
|
function normalizedText(value) {
|
|
3
3
|
return (value ?? '').trim().toLowerCase();
|
|
4
4
|
}
|
|
5
|
-
export function analyzeTitle(
|
|
6
|
-
const
|
|
7
|
-
const
|
|
5
|
+
export function analyzeTitle($) {
|
|
6
|
+
const isString = typeof $ === 'string';
|
|
7
|
+
const cheerioObj = isString ? load($ || '<html></html>') : $;
|
|
8
|
+
const title = cheerioObj('title').first().text().trim();
|
|
8
9
|
if (!title) {
|
|
9
10
|
return { value: null, length: 0, status: 'missing' };
|
|
10
11
|
}
|
|
@@ -14,9 +15,10 @@ export function analyzeTitle(html) {
|
|
|
14
15
|
return { value: title, length: title.length, status: 'too_long' };
|
|
15
16
|
return { value: title, length: title.length, status: 'ok' };
|
|
16
17
|
}
|
|
17
|
-
export function analyzeMetaDescription(
|
|
18
|
-
const
|
|
19
|
-
const
|
|
18
|
+
export function analyzeMetaDescription($) {
|
|
19
|
+
const isString = typeof $ === 'string';
|
|
20
|
+
const cheerioObj = isString ? load($ || '<html></html>') : $;
|
|
21
|
+
const raw = cheerioObj('meta[name="description"]').attr('content');
|
|
20
22
|
if (raw === undefined) {
|
|
21
23
|
return { value: null, length: 0, status: 'missing' };
|
|
22
24
|
}
|
|
@@ -30,35 +32,44 @@ export function analyzeMetaDescription(html) {
|
|
|
30
32
|
return { value: description, length: description.length, status: 'too_long' };
|
|
31
33
|
return { value: description, length: description.length, status: 'ok' };
|
|
32
34
|
}
|
|
33
|
-
export function
|
|
34
|
-
const
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
if (!key)
|
|
38
|
-
continue;
|
|
39
|
-
counts.set(key, (counts.get(key) || 0) + 1);
|
|
40
|
-
}
|
|
41
|
-
return fields.map((field) => {
|
|
42
|
-
const key = normalizedText(field.value);
|
|
43
|
-
if (!key)
|
|
44
|
-
return field;
|
|
45
|
-
if ((counts.get(key) || 0) > 1) {
|
|
46
|
-
return { ...field, status: 'duplicate' };
|
|
47
|
-
}
|
|
48
|
-
return field;
|
|
49
|
-
});
|
|
50
|
-
}
|
|
51
|
-
export function analyzeH1(html, titleValue) {
|
|
52
|
-
const $ = load(html);
|
|
53
|
-
const h1Values = $('h1').toArray().map((el) => $(el).text().trim()).filter(Boolean);
|
|
35
|
+
export function analyzeH1($, titleValue) {
|
|
36
|
+
const isString = typeof $ === 'string';
|
|
37
|
+
const cheerioObj = isString ? load($ || '<html></html>') : $;
|
|
38
|
+
const h1Values = cheerioObj('h1').toArray().map((el) => cheerioObj(el).text().trim()).filter(Boolean);
|
|
54
39
|
const count = h1Values.length;
|
|
55
40
|
const first = h1Values[0] || null;
|
|
56
41
|
const matchesTitle = Boolean(first && titleValue && normalizedText(first) === normalizedText(titleValue));
|
|
57
42
|
if (count === 0) {
|
|
58
|
-
return { count, status: 'critical', matchesTitle };
|
|
43
|
+
return { count, status: 'critical', matchesTitle, value: null };
|
|
59
44
|
}
|
|
60
45
|
if (count > 1) {
|
|
61
|
-
return { count, status: 'warning', matchesTitle };
|
|
46
|
+
return { count, status: 'warning', matchesTitle, value: first };
|
|
62
47
|
}
|
|
63
|
-
return { count, status: 'ok', matchesTitle };
|
|
48
|
+
return { count, status: 'ok', matchesTitle, value: first };
|
|
49
|
+
}
|
|
50
|
+
export function applyDuplicateStatuses(items) {
|
|
51
|
+
const counts = new Map();
|
|
52
|
+
const normalizedToOriginal = new Map();
|
|
53
|
+
// First pass: count occurrences of each normalized value
|
|
54
|
+
for (const item of items) {
|
|
55
|
+
if (item.value) {
|
|
56
|
+
const normalized = normalizedText(item.value);
|
|
57
|
+
if (normalized) {
|
|
58
|
+
counts.set(normalized, (counts.get(normalized) || 0) + 1);
|
|
59
|
+
if (!normalizedToOriginal.has(normalized)) {
|
|
60
|
+
normalizedToOriginal.set(normalized, item.value);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
// Second pass: apply duplicate status
|
|
66
|
+
return items.map(item => {
|
|
67
|
+
if (item.value) {
|
|
68
|
+
const normalized = normalizedText(item.value);
|
|
69
|
+
if ((counts.get(normalized) || 0) > 1) {
|
|
70
|
+
return { ...item, status: 'duplicate' };
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
return item;
|
|
74
|
+
});
|
|
64
75
|
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
export interface Soft404Result {
|
|
2
|
+
score: number;
|
|
3
|
+
reason: string;
|
|
4
|
+
}
|
|
5
|
+
/**
|
|
6
|
+
* Service to analyze HTML content for soft 404 signals.
|
|
7
|
+
* Extracts signals from title, H1, body content length, and outlinks.
|
|
8
|
+
*/
|
|
9
|
+
export declare class Soft404Service {
|
|
10
|
+
/**
|
|
11
|
+
* Analyzes HTML string to determine probability of being a soft 404 page.
|
|
12
|
+
* @param {string | undefined} html - Raw HTML source code.
|
|
13
|
+
* @param {number} outLinks - Total number of outbound links extracted during parsing.
|
|
14
|
+
* @returns {Soft404Result} A calculated score between 0.0 and 1.0, and the matched reasons.
|
|
15
|
+
*/
|
|
16
|
+
analyze(html: string | undefined, outLinks: number): Soft404Result;
|
|
17
|
+
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
/**
|
|
3
|
+
* Service to analyze HTML content for soft 404 signals.
|
|
4
|
+
* Extracts signals from title, H1, body content length, and outlinks.
|
|
5
|
+
*/
|
|
6
|
+
export class Soft404Service {
|
|
7
|
+
/**
|
|
8
|
+
* Analyzes HTML string to determine probability of being a soft 404 page.
|
|
9
|
+
* @param {string | undefined} html - Raw HTML source code.
|
|
10
|
+
* @param {number} outLinks - Total number of outbound links extracted during parsing.
|
|
11
|
+
* @returns {Soft404Result} A calculated score between 0.0 and 1.0, and the matched reasons.
|
|
12
|
+
*/
|
|
13
|
+
analyze(html, outLinks) {
|
|
14
|
+
if (!html)
|
|
15
|
+
return { score: 0, reason: '' };
|
|
16
|
+
let score = 0;
|
|
17
|
+
const signals = [];
|
|
18
|
+
const $ = cheerio.load(html);
|
|
19
|
+
$('script, style, noscript, iframe').remove();
|
|
20
|
+
const cleanText = $('body').text().replace(/\s+/g, ' ').trim();
|
|
21
|
+
const title = $('title').text().toLowerCase();
|
|
22
|
+
const h1Text = $('h1').first().text().toLowerCase();
|
|
23
|
+
const bodyText = cleanText.toLowerCase();
|
|
24
|
+
const errorPatterns = ['404', 'not found', 'error', "doesn't exist", 'unavailable', 'invalid'];
|
|
25
|
+
for (const pattern of errorPatterns) {
|
|
26
|
+
if (title.includes(pattern)) {
|
|
27
|
+
score += 0.4;
|
|
28
|
+
signals.push(`title_contains_${pattern}`);
|
|
29
|
+
break;
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
for (const pattern of errorPatterns) {
|
|
33
|
+
if (h1Text.includes(pattern)) {
|
|
34
|
+
score += 0.3;
|
|
35
|
+
signals.push(`h1_contains_${pattern}`);
|
|
36
|
+
break;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
|
|
40
|
+
score += 0.2;
|
|
41
|
+
signals.push('body_error_phrase');
|
|
42
|
+
}
|
|
43
|
+
const words = cleanText.split(/\s+/).filter(w => w.length > 0);
|
|
44
|
+
if (words.length < 50) {
|
|
45
|
+
score += 0.3;
|
|
46
|
+
signals.push('very_low_word_count');
|
|
47
|
+
}
|
|
48
|
+
else if (words.length < 150) {
|
|
49
|
+
score += 0.1;
|
|
50
|
+
signals.push('low_word_count');
|
|
51
|
+
}
|
|
52
|
+
if (outLinks === 0) {
|
|
53
|
+
score += 0.2;
|
|
54
|
+
signals.push('no_outbound_links');
|
|
55
|
+
}
|
|
56
|
+
score = Math.min(1.0, score);
|
|
57
|
+
return {
|
|
58
|
+
score: Number(score.toFixed(2)),
|
|
59
|
+
reason: signals.join(', ')
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
}
|
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
import { load } from 'cheerio';
|
|
2
|
-
export function analyzeStructuredData(
|
|
3
|
-
const
|
|
4
|
-
const
|
|
2
|
+
export function analyzeStructuredData($) {
|
|
3
|
+
const isString = typeof $ === 'string';
|
|
4
|
+
const cheerioObj = isString ? load($ || '<html></html>') : $;
|
|
5
|
+
const scripts = cheerioObj('script[type="application/ld+json"]').toArray();
|
|
5
6
|
if (scripts.length === 0) {
|
|
6
7
|
return { present: false, types: [], valid: false };
|
|
7
8
|
}
|
|
8
9
|
const types = new Set();
|
|
9
10
|
let valid = true;
|
|
10
11
|
for (const script of scripts) {
|
|
11
|
-
const raw =
|
|
12
|
+
const raw = cheerioObj(script).text().trim();
|
|
12
13
|
if (!raw) {
|
|
13
14
|
valid = false;
|
|
14
15
|
continue;
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
5
|
+
const __dirname = path.dirname(__filename);
|
|
6
|
+
export const ANALYSIS_LIST_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_list.html'), 'utf-8');
|
|
7
|
+
export const ANALYSIS_PAGE_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_page.html'), 'utf-8');
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { type AnalyzeOptions, type AnalysisResult } from '../analysis/analyze.js';
|
|
2
|
+
import { compareGraphs } from '../diff/compare.js';
|
|
3
|
+
import type { CrawlithPlugin, PluginContext } from '../plugin-system/plugin-types.js';
|
|
4
|
+
import type { UseCase } from './usecase.js';
|
|
5
|
+
import type { Graph } from '../graph/graph.js';
|
|
6
|
+
import type { EngineContext } from '../events.js';
|
|
7
|
+
export interface CrawlSitegraphResult {
|
|
8
|
+
snapshotId: number;
|
|
9
|
+
graph: Graph;
|
|
10
|
+
metrics?: any;
|
|
11
|
+
healthData?: any;
|
|
12
|
+
}
|
|
13
|
+
export interface SiteCrawlInput {
|
|
14
|
+
url: string;
|
|
15
|
+
limit?: number;
|
|
16
|
+
depth?: number;
|
|
17
|
+
concurrency?: number;
|
|
18
|
+
stripQuery?: boolean;
|
|
19
|
+
ignoreRobots?: boolean;
|
|
20
|
+
sitemap?: string | boolean;
|
|
21
|
+
debug?: boolean;
|
|
22
|
+
detectSoft404?: boolean;
|
|
23
|
+
detectTraps?: boolean;
|
|
24
|
+
rate?: number;
|
|
25
|
+
maxBytes?: number;
|
|
26
|
+
allowedDomains?: string[];
|
|
27
|
+
deniedDomains?: string[];
|
|
28
|
+
includeSubdomains?: boolean;
|
|
29
|
+
proxyUrl?: string;
|
|
30
|
+
maxRedirects?: number;
|
|
31
|
+
userAgent?: string;
|
|
32
|
+
clustering?: boolean;
|
|
33
|
+
clusterThreshold?: number;
|
|
34
|
+
minClusterSize?: number;
|
|
35
|
+
heading?: boolean;
|
|
36
|
+
failOnCritical?: boolean;
|
|
37
|
+
scoreBreakdown?: boolean;
|
|
38
|
+
computeHits?: boolean;
|
|
39
|
+
computePagerank?: boolean;
|
|
40
|
+
orphans?: boolean;
|
|
41
|
+
orphanSeverity?: 'low' | 'medium' | 'high';
|
|
42
|
+
includeSoftOrphans?: boolean;
|
|
43
|
+
minInbound?: number;
|
|
44
|
+
plugins?: CrawlithPlugin[];
|
|
45
|
+
context?: PluginContext;
|
|
46
|
+
}
|
|
47
|
+
export declare class CrawlSitegraph implements UseCase<SiteCrawlInput, CrawlSitegraphResult> {
|
|
48
|
+
execute(input: SiteCrawlInput): Promise<CrawlSitegraphResult>;
|
|
49
|
+
}
|
|
50
|
+
export declare class AnalyzeSnapshot implements UseCase<{
|
|
51
|
+
url: string;
|
|
52
|
+
options: AnalyzeOptions;
|
|
53
|
+
plugins?: CrawlithPlugin[];
|
|
54
|
+
context?: PluginContext;
|
|
55
|
+
}, AnalysisResult> {
|
|
56
|
+
execute(input: {
|
|
57
|
+
url: string;
|
|
58
|
+
options: AnalyzeOptions;
|
|
59
|
+
plugins?: CrawlithPlugin[];
|
|
60
|
+
context?: PluginContext;
|
|
61
|
+
}): Promise<AnalysisResult>;
|
|
62
|
+
}
|
|
63
|
+
export interface PageAnalysisInput {
|
|
64
|
+
url: string;
|
|
65
|
+
live?: boolean;
|
|
66
|
+
snapshotId?: number;
|
|
67
|
+
seo?: boolean;
|
|
68
|
+
content?: boolean;
|
|
69
|
+
accessibility?: boolean;
|
|
70
|
+
rate?: number;
|
|
71
|
+
proxyUrl?: string;
|
|
72
|
+
userAgent?: string;
|
|
73
|
+
maxRedirects?: number;
|
|
74
|
+
maxBytes?: number;
|
|
75
|
+
clustering?: boolean;
|
|
76
|
+
clusterThreshold?: number;
|
|
77
|
+
minClusterSize?: number;
|
|
78
|
+
debug?: boolean;
|
|
79
|
+
allPages?: boolean;
|
|
80
|
+
sitemap?: string | boolean;
|
|
81
|
+
heading?: boolean;
|
|
82
|
+
health?: boolean;
|
|
83
|
+
failOnCritical?: boolean;
|
|
84
|
+
scoreBreakdown?: boolean;
|
|
85
|
+
computeHits?: boolean;
|
|
86
|
+
computePagerank?: boolean;
|
|
87
|
+
orphans?: boolean;
|
|
88
|
+
orphanSeverity?: 'low' | 'medium' | 'high';
|
|
89
|
+
includeSoftOrphans?: boolean;
|
|
90
|
+
minInbound?: number;
|
|
91
|
+
plugins?: CrawlithPlugin[];
|
|
92
|
+
context?: PluginContext;
|
|
93
|
+
}
|
|
94
|
+
export declare class PageAnalysisUseCase implements UseCase<PageAnalysisInput, AnalysisResult> {
|
|
95
|
+
private readonly context?;
|
|
96
|
+
constructor(context?: EngineContext | undefined);
|
|
97
|
+
execute(input: PageAnalysisInput): Promise<AnalysisResult>;
|
|
98
|
+
}
|
|
99
|
+
export declare class ExportReport implements UseCase<{
|
|
100
|
+
snapshotId: number;
|
|
101
|
+
}, string> {
|
|
102
|
+
execute(input: {
|
|
103
|
+
snapshotId: number;
|
|
104
|
+
}): Promise<string>;
|
|
105
|
+
}
|
|
106
|
+
export declare class DiffSnapshots implements UseCase<{
|
|
107
|
+
oldSnapshotId: number;
|
|
108
|
+
newSnapshotId: number;
|
|
109
|
+
}, ReturnType<typeof compareGraphs>> {
|
|
110
|
+
execute(input: {
|
|
111
|
+
oldSnapshotId: number;
|
|
112
|
+
newSnapshotId: number;
|
|
113
|
+
}): Promise<import("../diff/compare.js").DiffResult>;
|
|
114
|
+
}
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import { crawl } from '../crawler/crawl.js';
|
|
2
|
+
import { DEFAULTS } from '../constants.js';
|
|
3
|
+
import { runPostCrawlMetrics } from '../crawler/metricsRunner.js';
|
|
4
|
+
import { analyzeSite } from '../analysis/analyze.js';
|
|
5
|
+
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
6
|
+
import { compareGraphs } from '../diff/compare.js';
|
|
7
|
+
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
8
|
+
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
9
|
+
import { PluginRegistry } from '../plugin-system/plugin-registry.js';
|
|
10
|
+
import { getCrawlithDB } from '../db/index.js';
|
|
11
|
+
import { UrlUtil } from '../crawler/normalize.js';
|
|
12
|
+
export class CrawlSitegraph {
|
|
13
|
+
async execute(input) {
|
|
14
|
+
const ctx = input.context ?? { command: 'crawl', scope: 'crawl' };
|
|
15
|
+
ctx.scope = 'crawl';
|
|
16
|
+
ctx.db = getCrawlithDB();
|
|
17
|
+
const registry = new PluginRegistry(input.plugins ?? []);
|
|
18
|
+
await registry.applyStorage(ctx);
|
|
19
|
+
await registry.runHook('onInit', ctx);
|
|
20
|
+
if (ctx.terminate)
|
|
21
|
+
return { snapshotId: 0, graph: undefined };
|
|
22
|
+
await registry.runHook('onCrawlStart', ctx);
|
|
23
|
+
if (ctx.terminate)
|
|
24
|
+
return { snapshotId: 0, graph: undefined };
|
|
25
|
+
const policy = (ctx.metadata?.crawlPolicy || {});
|
|
26
|
+
// Map the unified DTO into the underlying CrawlOptions
|
|
27
|
+
const crawlOpts = {
|
|
28
|
+
limit: input.limit ?? DEFAULTS.CRAWL_LIMIT,
|
|
29
|
+
depth: input.depth ?? DEFAULTS.MAX_DEPTH,
|
|
30
|
+
concurrency: input.concurrency,
|
|
31
|
+
stripQuery: input.stripQuery,
|
|
32
|
+
ignoreRobots: policy.ignoreRobots !== undefined ? policy.ignoreRobots : input.ignoreRobots,
|
|
33
|
+
sitemap: input.sitemap,
|
|
34
|
+
debug: input.debug,
|
|
35
|
+
detectSoft404: input.detectSoft404,
|
|
36
|
+
detectTraps: input.detectTraps,
|
|
37
|
+
rate: policy.rate !== undefined ? policy.rate : input.rate,
|
|
38
|
+
maxBytes: policy.maxBytes !== undefined ? policy.maxBytes : input.maxBytes,
|
|
39
|
+
allowedDomains: policy.allowedDomains?.length ? policy.allowedDomains : input.allowedDomains,
|
|
40
|
+
deniedDomains: policy.deniedDomains?.length ? policy.deniedDomains : input.deniedDomains,
|
|
41
|
+
includeSubdomains: policy.includeSubdomains !== undefined ? policy.includeSubdomains : input.includeSubdomains,
|
|
42
|
+
proxyUrl: policy.proxyUrl !== undefined ? policy.proxyUrl : input.proxyUrl,
|
|
43
|
+
maxRedirects: policy.maxRedirects !== undefined ? policy.maxRedirects : input.maxRedirects,
|
|
44
|
+
userAgent: policy.userAgent !== undefined ? policy.userAgent : input.userAgent,
|
|
45
|
+
registry: registry
|
|
46
|
+
};
|
|
47
|
+
// Build an EngineContext from the plugin context's emit so per-page logs reach OutputController
|
|
48
|
+
const engineContext = ctx.emit
|
|
49
|
+
? { emit: (e) => ctx.emit(e) }
|
|
50
|
+
: undefined;
|
|
51
|
+
const snapshotId = await crawl(input.url, crawlOpts, engineContext);
|
|
52
|
+
const graph = loadGraphFromSnapshot(snapshotId);
|
|
53
|
+
// Ensure plugin hooks that persist data are scoped to the created snapshot.
|
|
54
|
+
ctx.snapshotId = snapshotId;
|
|
55
|
+
await registry.runHook('onGraphBuilt', ctx, graph);
|
|
56
|
+
await registry.runHook('onMetrics', ctx, graph);
|
|
57
|
+
const db = getCrawlithDB().unsafeGetRawDb();
|
|
58
|
+
const siteRepo = new SiteRepository(db);
|
|
59
|
+
const snapshotRepo = new SnapshotRepository(db);
|
|
60
|
+
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
61
|
+
let resolvedOrigin = '';
|
|
62
|
+
if (snapshot) {
|
|
63
|
+
const site = siteRepo.getSiteById(snapshot.site_id);
|
|
64
|
+
if (site?.preferred_url) {
|
|
65
|
+
try {
|
|
66
|
+
resolvedOrigin = new URL(site.preferred_url).origin;
|
|
67
|
+
}
|
|
68
|
+
catch { /* ignore */ }
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
const result = runPostCrawlMetrics(snapshotId, crawlOpts.depth, {
|
|
72
|
+
context: undefined,
|
|
73
|
+
limitReached: false,
|
|
74
|
+
graphInstance: graph,
|
|
75
|
+
clustering: input.clustering ?? true,
|
|
76
|
+
clusterThreshold: input.clusterThreshold,
|
|
77
|
+
minClusterSize: input.minClusterSize,
|
|
78
|
+
heading: input.heading ?? true,
|
|
79
|
+
// Always compute and persist health score for full crawls.
|
|
80
|
+
health: true,
|
|
81
|
+
computeHits: input.computeHits ?? true,
|
|
82
|
+
computePagerank: input.computePagerank ?? true,
|
|
83
|
+
orphans: input.orphans ?? true,
|
|
84
|
+
orphanSeverity: input.orphanSeverity ?? true,
|
|
85
|
+
includeSoftOrphans: input.includeSoftOrphans ?? true,
|
|
86
|
+
minInbound: input.minInbound,
|
|
87
|
+
rootOrigin: resolvedOrigin || (input.url.startsWith('http://') || input.url.startsWith('https://')
|
|
88
|
+
? new URL(input.url).origin
|
|
89
|
+
: `https://${UrlUtil.extractDomain(input.url)}`)
|
|
90
|
+
});
|
|
91
|
+
const metrics = result?.metrics;
|
|
92
|
+
const healthData = result?.healthData;
|
|
93
|
+
if (ctx.db) {
|
|
94
|
+
ctx.db.aggregateScoreProviders(snapshotId, registry.pluginsList);
|
|
95
|
+
}
|
|
96
|
+
await registry.runHook('onReport', ctx, { snapshotId, graph, metrics, healthData });
|
|
97
|
+
return { snapshotId, graph, metrics, healthData };
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
export class AnalyzeSnapshot {
|
|
101
|
+
async execute(input) {
|
|
102
|
+
const result = await analyzeSite(input.url, { ...input.options, live: false });
|
|
103
|
+
if (input.plugins && input.plugins.length > 0) {
|
|
104
|
+
const registry = new PluginRegistry(input.plugins);
|
|
105
|
+
const ctx = {
|
|
106
|
+
command: 'analyze',
|
|
107
|
+
scope: 'crawl',
|
|
108
|
+
...(input.context || {}),
|
|
109
|
+
db: getCrawlithDB()
|
|
110
|
+
};
|
|
111
|
+
await registry.runHook('onInit', ctx);
|
|
112
|
+
await registry.runHook('onReport', ctx, result);
|
|
113
|
+
}
|
|
114
|
+
return result;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
export class PageAnalysisUseCase {
|
|
118
|
+
context;
|
|
119
|
+
constructor(context) {
|
|
120
|
+
this.context = context;
|
|
121
|
+
}
|
|
122
|
+
/**
 * Runs a single-page (or snapshot-backed) site analysis and then fires any
 * configured plugins with page scope.
 *
 * @param input - analysis options: url, live/snapshotId, per-module toggles
 *   (seo, content, accessibility, health, heading), fetch settings (rate,
 *   proxyUrl, userAgent, maxRedirects, maxBytes), clustering/orphan options,
 *   plugins, and an optional caller context merged into the plugin context.
 * @returns the result object produced by analyzeSite (includes snapshotId and
 *   a pages array with url/html/status — shape owned by analyzeSite).
 */
async execute(input) {
    // When running a live single-page analysis, enable all content modules
    // by default (they all work well on a single page). Callers can still
    // explicitly pass false to disable any of them.
    const isLive = !!input.live;
    // '??' (not '||') so an explicit `false` from the caller is respected.
    const seo = input.seo ?? (isLive ? true : undefined);
    const content = input.content ?? (isLive ? true : undefined);
    const accessibility = input.accessibility ?? (isLive ? true : undefined);
    const health = input.health ?? (isLive ? true : undefined);
    const heading = input.heading ?? (isLive ? true : undefined);
    // Forward everything to the core analyzer; this.context supplies shared services.
    const result = await analyzeSite(input.url, {
        live: input.live,
        snapshotId: input.snapshotId,
        seo,
        content,
        accessibility,
        rate: input.rate,
        proxyUrl: input.proxyUrl,
        userAgent: input.userAgent,
        maxRedirects: input.maxRedirects,
        maxBytes: input.maxBytes,
        clustering: input.clustering,
        clusterThreshold: input.clusterThreshold,
        minClusterSize: input.minClusterSize,
        sitemap: input.sitemap,
        heading,
        health,
        failOnCritical: input.failOnCritical,
        scoreBreakdown: input.scoreBreakdown,
        computeHits: input.computeHits,
        computePagerank: input.computePagerank,
        orphans: input.orphans,
        orphanSeverity: input.orphanSeverity,
        includeSoftOrphans: input.includeSoftOrphans,
        minInbound: input.minInbound,
        debug: input.debug,
        allPages: input.allPages,
    }, this.context);
    // Run plugins with page scope — only onInit + onPage are called
    if (input.plugins && input.plugins.length > 0) {
        // Lazy dynamic import: the plugin system is only loaded when plugins are configured.
        const { PluginRegistry } = await import('../plugin-system/plugin-registry.js');
        const registry = new PluginRegistry(input.plugins);
        const pluginCtx = {
            command: 'page',
            scope: 'page',
            targetUrl: input.url,
            snapshotId: result.snapshotId,
            live: input.live || !!(input.context?.flags?.live),
            // NOTE(review): spreading input.context AFTER the keys above means a
            // caller-supplied context can override command/scope/targetUrl/
            // snapshotId/live — confirm this precedence is intended.
            ...(input.context || {}),
            // db is spread-proof: listed last, so it always wins over input.context.
            db: getCrawlithDB(),
        };
        await registry.applyStorage(pluginCtx);
        await registry.runHook('onInit', pluginCtx);
        // Fire onPage once per analyzed page (normally just 1 for the page command)
        const inputOrigin = new URL(input.url).origin;
        for (const page of result.pages) {
            // Page URLs may be stored relative; resolve against the input origin.
            const absoluteUrl = UrlUtil.toAbsolute(page.url, inputOrigin);
            await registry.runHook('onPage', pluginCtx, {
                url: absoluteUrl,
                html: page.html ?? '',
                status: page.status,
            });
        }
        // Fold plugin-provided scores into the snapshot once all pages are processed.
        if (pluginCtx.db && result.snapshotId) {
            pluginCtx.db.aggregateScoreProviders(result.snapshotId, registry.pluginsList);
        }
    }
    return result;
}
|
|
191
|
+
}
|
|
192
|
+
/** Serializes a stored snapshot's link graph into a JSON report string. */
export class ExportReport {
    /**
     * @param input - expects { snapshotId } identifying a stored snapshot.
     * @returns JSON string of the snapshot graph's serializable form.
     */
    async execute(input) {
        const graph = loadGraphFromSnapshot(input.snapshotId);
        const serializable = graph.toJSON();
        return JSON.stringify(serializable);
    }
}
|
|
197
|
+
/** Compares two stored snapshot graphs and reports their differences. */
export class DiffSnapshots {
    /**
     * @param input - expects { oldSnapshotId, newSnapshotId }.
     * @returns whatever compareGraphs produces for the two loaded graphs
     *   (diff shape is owned by compareGraphs).
     */
    async execute(input) {
        const before = loadGraphFromSnapshot(input.oldSnapshotId);
        const after = loadGraphFromSnapshot(input.newSnapshotId);
        return compareGraphs(before, after);
    }
}
|
package/dist/audit/index.js
CHANGED
|
@@ -26,7 +26,7 @@ export async function auditUrl(urlStr, options = {}) {
|
|
|
26
26
|
// We handle transport errors differently as they are fatal for the audit (e.g. connection refused)
|
|
27
27
|
// DNS errors might return partial results but usually if transport works, DNS worked (unless transport used IP)
|
|
28
28
|
const dnsPromise = resolveDns(url.hostname);
|
|
29
|
-
const transportPromise = analyzeTransport(urlStr, timeout);
|
|
29
|
+
const transportPromise = analyzeTransport(urlStr, timeout, options.userAgent);
|
|
30
30
|
const [dnsResult, transportResult] = await Promise.all([
|
|
31
31
|
dnsPromise,
|
|
32
32
|
transportPromise
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { TransportDiagnostics, PerformanceMetrics, AuditIssue } from './types.js';
|
|
2
|
-
export declare function analyzeTransport(targetUrl: string, timeout: number): Promise<{
|
|
2
|
+
export declare function analyzeTransport(targetUrl: string, timeout: number, userAgent?: string): Promise<{
|
|
3
3
|
transport: TransportDiagnostics;
|
|
4
4
|
performance: PerformanceMetrics;
|
|
5
5
|
issues: AuditIssue[];
|
package/dist/audit/transport.js
CHANGED
|
@@ -3,7 +3,8 @@ import http from 'node:http';
|
|
|
3
3
|
import tls from 'node:tls';
|
|
4
4
|
import { URL } from 'node:url';
|
|
5
5
|
import { IPGuard } from '../core/security/ipGuard.js';
|
|
6
|
-
|
|
6
|
+
import { DEFAULTS } from '../constants.js';
|
|
7
|
+
export async function analyzeTransport(targetUrl, timeout, userAgent) {
|
|
7
8
|
const maxRedirects = 10;
|
|
8
9
|
let currentUrl = targetUrl;
|
|
9
10
|
let redirectCount = 0;
|
|
@@ -18,7 +19,7 @@ export async function analyzeTransport(targetUrl, timeout) {
|
|
|
18
19
|
throw new Error(`Blocked: Redirect to internal/private IP prohibited (${currentUrl})`);
|
|
19
20
|
}
|
|
20
21
|
try {
|
|
21
|
-
const result = await executeRequest(currentUrl, timeout);
|
|
22
|
+
const result = await executeRequest(currentUrl, timeout, userAgent);
|
|
22
23
|
if (result.redirectUrl) {
|
|
23
24
|
redirectCount++;
|
|
24
25
|
totalRedirectTime += result.timings.total;
|
|
@@ -113,7 +114,7 @@ export async function analyzeTransport(targetUrl, timeout) {
|
|
|
113
114
|
}
|
|
114
115
|
throw new Error(`Too many redirects (limit: ${maxRedirects})`);
|
|
115
116
|
}
|
|
116
|
-
function executeRequest(urlStr, timeout) {
|
|
117
|
+
function executeRequest(urlStr, timeout, userAgent) {
|
|
117
118
|
return new Promise((resolve, reject) => {
|
|
118
119
|
let url;
|
|
119
120
|
try {
|
|
@@ -143,7 +144,7 @@ function executeRequest(urlStr, timeout) {
|
|
|
143
144
|
rejectUnauthorized: false,
|
|
144
145
|
agent: false,
|
|
145
146
|
headers: {
|
|
146
|
-
'User-Agent':
|
|
147
|
+
'User-Agent': userAgent || DEFAULTS.USER_AGENT,
|
|
147
148
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
148
149
|
'Accept-Encoding': 'gzip, deflate, br'
|
|
149
150
|
}
|
package/dist/audit/types.d.ts
CHANGED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// Ambient declaration for the shared runtime defaults (implementation in constants.js).
export declare const DEFAULTS: {
    // Crawler defaults
    readonly MAX_DEPTH: 5;
    readonly MAX_DEPTH_LIMIT: 10;
    readonly CONCURRENCY: 2;
    readonly CONCURRENCY_LIMIT: 10;
    readonly CRAWL_LIMIT: 500;
    // Network/fetcher defaults; USER_AGENT is "crawlith/<version>" at runtime.
    readonly USER_AGENT: `crawlith/${string}`;
    readonly RATE_LIMIT: 10;
    readonly MAX_BYTES: 2000000;
    readonly MAX_REDIRECTS: 5;
    readonly MAX_REDIRECTS_LIMIT: 11;
    // Network timeouts (presumably milliseconds — confirm against fetcher usage)
    readonly HEADERS_TIMEOUT: 10000;
    readonly BODY_TIMEOUT: 10000;
    // Snapshot retention caps
    readonly MAX_SNAPSHOTS: 5;
    readonly MAX_SINGLE_SNAPSHOTS: 5;
    // Graph calculation precision (convergence epsilon — TODO confirm in graph code)
    readonly GRAPH_PRECISION: 1e-12;
};
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { version } from './utils/version.js';
// Central runtime defaults shared across the crawler, fetcher, and graph code.
// NOTE(review): the *_LIMIT entries appear to be hard caps for their non-LIMIT
// counterparts — confirm against option validation.
export const DEFAULTS = {
    // Crawler defaults
    MAX_DEPTH: 5,
    MAX_DEPTH_LIMIT: 10,
    CONCURRENCY: 2,
    CONCURRENCY_LIMIT: 10,
    CRAWL_LIMIT: 500,
    // Network/Fetcher defaults
    USER_AGENT: `crawlith/${version}`, // e.g. "crawlith/0.1.2"
    RATE_LIMIT: 10,
    MAX_BYTES: 2000000, // 2MB
    MAX_REDIRECTS: 5,
    MAX_REDIRECTS_LIMIT: 11,
    // Network timeouts (presumably milliseconds — confirm against fetcher)
    HEADERS_TIMEOUT: 10000,
    BODY_TIMEOUT: 10000,
    // Keep only last 5 snapshots
    MAX_SNAPSHOTS: 5,
    MAX_SINGLE_SNAPSHOTS: 5,
    // Graph calculation precision
    GRAPH_PRECISION: 1e-12
};
|
|
@@ -15,6 +15,9 @@ export class ScopeManager {
|
|
|
15
15
|
}));
|
|
16
16
|
}
|
|
17
17
|
isUrlEligible(url) {
|
|
18
|
+
// Root-relative paths (e.g. '/about', '/?q=foo') are always internal
|
|
19
|
+
if (url.startsWith('/'))
|
|
20
|
+
return 'allowed';
|
|
18
21
|
let hostname;
|
|
19
22
|
try {
|
|
20
23
|
hostname = new URL(url).hostname.toLowerCase();
|