@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/crawler/extract.js
CHANGED
|
@@ -2,8 +2,11 @@ import * as cheerio from 'cheerio';
|
|
|
2
2
|
/**
|
|
3
3
|
* Extracts all links from an HTML document.
|
|
4
4
|
* Returns absolute URLs.
|
|
5
|
+
* @param html The HTML content string
|
|
6
|
+
* @param baseUrl The base URL to resolve relative links against
|
|
7
|
+
* @param onError Optional callback for handling extraction errors
|
|
5
8
|
*/
|
|
6
|
-
export function extractLinks(html, baseUrl) {
|
|
9
|
+
export function extractLinks(html, baseUrl, onError) {
|
|
7
10
|
try {
|
|
8
11
|
const $ = cheerio.load(html);
|
|
9
12
|
const links = new Set();
|
|
@@ -27,7 +30,9 @@ export function extractLinks(html, baseUrl) {
|
|
|
27
30
|
return Array.from(links);
|
|
28
31
|
}
|
|
29
32
|
catch (e) {
|
|
30
|
-
|
|
33
|
+
if (onError) {
|
|
34
|
+
onError(e);
|
|
35
|
+
}
|
|
31
36
|
return [];
|
|
32
37
|
}
|
|
33
38
|
}
|
|
@@ -23,9 +23,10 @@ export interface FetchOptions {
|
|
|
23
23
|
crawlDelay?: number;
|
|
24
24
|
}
|
|
25
25
|
export declare class Fetcher {
|
|
26
|
-
|
|
26
|
+
userAgent: string;
|
|
27
27
|
private rateLimiter;
|
|
28
28
|
private proxyAdapter;
|
|
29
|
+
private secureDispatcher;
|
|
29
30
|
private scopeManager?;
|
|
30
31
|
private maxRedirects;
|
|
31
32
|
constructor(options?: {
|
package/dist/crawler/fetcher.js
CHANGED
|
@@ -1,26 +1,34 @@
|
|
|
1
1
|
import { request } from 'undici';
|
|
2
|
+
import * as net from 'net';
|
|
2
3
|
import { IPGuard } from '../core/security/ipGuard.js';
|
|
3
4
|
import { RateLimiter } from '../core/network/rateLimiter.js';
|
|
4
5
|
import { RetryPolicy } from '../core/network/retryPolicy.js';
|
|
5
6
|
import { ResponseLimiter } from '../core/network/responseLimiter.js';
|
|
6
7
|
import { RedirectController } from '../core/network/redirectController.js';
|
|
7
8
|
import { ProxyAdapter } from '../core/network/proxyAdapter.js';
|
|
8
|
-
import {
|
|
9
|
+
import { DEFAULTS } from '../constants.js';
|
|
9
10
|
export class Fetcher {
|
|
10
|
-
userAgent =
|
|
11
|
+
userAgent = DEFAULTS.USER_AGENT;
|
|
11
12
|
rateLimiter;
|
|
12
13
|
proxyAdapter;
|
|
14
|
+
secureDispatcher;
|
|
13
15
|
scopeManager;
|
|
14
16
|
maxRedirects;
|
|
15
17
|
constructor(options = {}) {
|
|
16
|
-
this.rateLimiter = new RateLimiter(options.rate ||
|
|
18
|
+
this.rateLimiter = new RateLimiter(options.rate || DEFAULTS.RATE_LIMIT);
|
|
17
19
|
this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
|
|
20
|
+
if (this.proxyAdapter.dispatcher) {
|
|
21
|
+
this.secureDispatcher = this.proxyAdapter.dispatcher;
|
|
22
|
+
}
|
|
23
|
+
else {
|
|
24
|
+
this.secureDispatcher = IPGuard.getSecureDispatcher();
|
|
25
|
+
}
|
|
18
26
|
this.scopeManager = options.scopeManager;
|
|
19
|
-
this.maxRedirects = Math.min(options.maxRedirects ??
|
|
20
|
-
this.userAgent = options.userAgent ||
|
|
27
|
+
this.maxRedirects = Math.min(options.maxRedirects ?? DEFAULTS.MAX_REDIRECTS, DEFAULTS.MAX_REDIRECTS_LIMIT);
|
|
28
|
+
this.userAgent = options.userAgent || DEFAULTS.USER_AGENT;
|
|
21
29
|
}
|
|
22
30
|
async fetch(url, options = {}) {
|
|
23
|
-
const maxBytes = options.maxBytes ||
|
|
31
|
+
const maxBytes = options.maxBytes || DEFAULTS.MAX_BYTES;
|
|
24
32
|
const redirectChain = [];
|
|
25
33
|
const redirectController = new RedirectController(this.maxRedirects, url);
|
|
26
34
|
let currentUrl = url;
|
|
@@ -28,10 +36,14 @@ export class Fetcher {
|
|
|
28
36
|
// Use a while(true) and explicit return/continue to handle redirects
|
|
29
37
|
while (true) {
|
|
30
38
|
const urlObj = new URL(currentUrl);
|
|
31
|
-
// 1. SSRF Guard
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
39
|
+
// 1. SSRF Guard (IP Literals only)
|
|
40
|
+
// We only check explicit IP literals here to fail fast.
|
|
41
|
+
// For domains, we rely on the secureDispatcher (which uses IPGuard.secureLookup)
|
|
42
|
+
// to resolve and validate the IP at connection time, preventing TOCTOU attacks.
|
|
43
|
+
if (net.isIP(urlObj.hostname)) {
|
|
44
|
+
if (IPGuard.isInternal(urlObj.hostname)) {
|
|
45
|
+
return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
|
|
46
|
+
}
|
|
35
47
|
}
|
|
36
48
|
// 2. Scope Validation (Domain & Subdomain)
|
|
37
49
|
if (this.scopeManager) {
|
|
@@ -61,7 +73,7 @@ export class Fetcher {
|
|
|
61
73
|
method: 'GET',
|
|
62
74
|
headers,
|
|
63
75
|
maxRedirections: 0,
|
|
64
|
-
dispatcher: this.
|
|
76
|
+
dispatcher: this.secureDispatcher,
|
|
65
77
|
headersTimeout: 10000,
|
|
66
78
|
bodyTimeout: 10000
|
|
67
79
|
});
|
|
@@ -141,6 +153,9 @@ export class Fetcher {
|
|
|
141
153
|
catch (error) {
|
|
142
154
|
// Map common network errors to specific statuses if needed
|
|
143
155
|
const isProxyError = error.message?.toLowerCase().includes('proxy') || error.code === 'ECONNREFUSED';
|
|
156
|
+
if (error.code === 'EBLOCKED' || error.message?.includes('Blocked internal IP')) {
|
|
157
|
+
return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
|
|
158
|
+
}
|
|
144
159
|
const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';
|
|
145
160
|
return this.errorResult(totalRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus, currentUrl, redirectChain, totalRetries);
|
|
146
161
|
}
|
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
import { EngineContext } from '../events.js';
|
|
2
|
+
import { Graph } from '../graph/graph.js';
|
|
3
|
+
export interface PostCrawlOptions {
|
|
4
|
+
context?: EngineContext;
|
|
5
|
+
limitReached?: boolean;
|
|
6
|
+
graphInstance?: Graph;
|
|
7
|
+
clustering?: boolean;
|
|
8
|
+
clusterThreshold?: number;
|
|
9
|
+
minClusterSize?: number;
|
|
10
|
+
health?: boolean;
|
|
11
|
+
computePagerank?: boolean;
|
|
12
|
+
computeHits?: boolean;
|
|
13
|
+
heading?: boolean;
|
|
14
|
+
orphans?: boolean;
|
|
15
|
+
orphanSeverity?: 'low' | 'medium' | 'high' | boolean;
|
|
16
|
+
includeSoftOrphans?: boolean;
|
|
17
|
+
minInbound?: number;
|
|
18
|
+
rootOrigin?: string;
|
|
19
|
+
}
|
|
20
|
+
export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, options?: PostCrawlOptions): {
|
|
21
|
+
metrics: any;
|
|
22
|
+
healthData?: any;
|
|
23
|
+
} | undefined;
|
|
@@ -3,51 +3,189 @@ import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
|
3
3
|
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
4
4
|
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
5
5
|
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
6
|
-
import { computePageRank } from '../graph/pagerank.js';
|
|
7
6
|
import { calculateMetrics } from '../graph/metrics.js';
|
|
8
|
-
import {
|
|
9
|
-
|
|
7
|
+
import { PageRankService } from '../graph/pagerank.js';
|
|
8
|
+
import { HITSService } from '../graph/hits.js';
|
|
9
|
+
import { TrapDetector } from './trap.js';
|
|
10
|
+
import { ClusteringService } from '../analysis/clustering.js';
|
|
11
|
+
import { DuplicateService } from '../analysis/duplicate.js';
|
|
12
|
+
import { annotateOrphans } from '../analysis/orphan.js';
|
|
13
|
+
import { Soft404Service } from '../analysis/soft404.js';
|
|
14
|
+
import { HeadingHealthService } from '../analysis/heading.js';
|
|
15
|
+
import { HealthService } from '../scoring/health.js';
|
|
16
|
+
import { analyzeContent } from '../analysis/content.js';
|
|
17
|
+
import { load } from 'cheerio';
|
|
18
|
+
export function runPostCrawlMetrics(snapshotId, maxDepth, options = {}) {
|
|
19
|
+
const context = options.context;
|
|
20
|
+
const limitReached = options.limitReached || false;
|
|
21
|
+
const graphInstance = options.graphInstance;
|
|
10
22
|
const db = getDb();
|
|
11
23
|
const metricsRepo = new MetricsRepository(db);
|
|
12
24
|
const snapshotRepo = new SnapshotRepository(db);
|
|
13
25
|
const pageRepo = new PageRepository(db);
|
|
26
|
+
const graph = graphInstance || loadGraphFromSnapshot(snapshotId);
|
|
27
|
+
// Fallback emitter
|
|
28
|
+
const emit = (event) => {
|
|
29
|
+
if (context) {
|
|
30
|
+
context.emit(event);
|
|
31
|
+
}
|
|
32
|
+
else {
|
|
33
|
+
if (event.type === 'error')
|
|
34
|
+
console.error(event.message);
|
|
35
|
+
else if (event.type !== 'debug') {
|
|
36
|
+
const out = event.message || event.phase;
|
|
37
|
+
if (out)
|
|
38
|
+
console.log(out);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
};
|
|
14
42
|
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
15
43
|
if (!snapshot) {
|
|
16
|
-
|
|
17
|
-
return;
|
|
44
|
+
emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
|
|
45
|
+
return undefined;
|
|
18
46
|
}
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
const
|
|
33
|
-
|
|
34
|
-
snapshot_id
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
duplicate_type: node.duplicateType ?? null,
|
|
48
|
-
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
47
|
+
if (!graphInstance) {
|
|
48
|
+
emit({ type: 'metrics:start', phase: 'Loading graph' });
|
|
49
|
+
}
|
|
50
|
+
emit({ type: 'metrics:start', phase: 'Running core algorithms' });
|
|
51
|
+
// 1. Graph Algorithms
|
|
52
|
+
const prResults = options.computePagerank ? new PageRankService().evaluate(graph) : new Map();
|
|
53
|
+
const hitsResults = options.computeHits ? new HITSService().evaluate(graph, { iterations: 20 }) : new Map();
|
|
54
|
+
// 2. Crawler Safety
|
|
55
|
+
new TrapDetector().analyze(graph);
|
|
56
|
+
// 3. Analysis / Intelligence
|
|
57
|
+
if (options.clustering) {
|
|
58
|
+
const contentClusters = new ClusteringService().detectContentClusters(graph, options.clusterThreshold, options.minClusterSize);
|
|
59
|
+
if (contentClusters.length > 0) {
|
|
60
|
+
const insertCluster = db.prepare(`
|
|
61
|
+
INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
|
|
62
|
+
VALUES (@id, @snapshot_id, @count, @primary_url, @risk, @shared_path_prefix)
|
|
63
|
+
`);
|
|
64
|
+
const insertContentTx = db.transaction((clusters) => {
|
|
65
|
+
for (const c of clusters) {
|
|
66
|
+
insertCluster.run({
|
|
67
|
+
id: c.id,
|
|
68
|
+
snapshot_id: snapshotId,
|
|
69
|
+
count: c.count,
|
|
70
|
+
primary_url: c.primaryUrl,
|
|
71
|
+
risk: c.risk,
|
|
72
|
+
shared_path_prefix: c.sharedPathPrefix ?? null
|
|
73
|
+
});
|
|
74
|
+
}
|
|
49
75
|
});
|
|
50
|
-
|
|
76
|
+
insertContentTx(contentClusters);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
const duplicateClusters = new DuplicateService().detectDuplicates(graph, { collapse: false });
|
|
80
|
+
if (duplicateClusters.length > 0) {
|
|
81
|
+
const insertCluster = db.prepare(`
|
|
82
|
+
INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
|
|
83
|
+
VALUES (@id, @snapshot_id, @type, @size, @representative, @severity)
|
|
84
|
+
`);
|
|
85
|
+
const insertDuplicateTx = db.transaction((clusters) => {
|
|
86
|
+
for (const c of clusters) {
|
|
87
|
+
insertCluster.run({
|
|
88
|
+
id: c.id,
|
|
89
|
+
snapshot_id: snapshotId,
|
|
90
|
+
type: c.type, // valid: 'exact' | 'near' | 'template_heavy'
|
|
91
|
+
size: c.size,
|
|
92
|
+
representative: c.representative,
|
|
93
|
+
severity: c.severity || 'low'
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
});
|
|
97
|
+
insertDuplicateTx(duplicateClusters);
|
|
98
|
+
}
|
|
99
|
+
let annotatedNodes = [];
|
|
100
|
+
if (options.orphans) {
|
|
101
|
+
const orphanOptions = {
|
|
102
|
+
enabled: true,
|
|
103
|
+
severityEnabled: !!options.orphanSeverity || options.orphanSeverity === undefined,
|
|
104
|
+
includeSoftOrphans: options.includeSoftOrphans ?? true,
|
|
105
|
+
minInbound: options.minInbound ?? 2
|
|
106
|
+
};
|
|
107
|
+
annotatedNodes = annotateOrphans(graph.getNodes(), graph.getEdges(), orphanOptions);
|
|
108
|
+
}
|
|
109
|
+
const soft404Service = new Soft404Service();
|
|
110
|
+
const headingService = new HeadingHealthService();
|
|
111
|
+
// Pre-calculate heading health for all nodes with HTML
|
|
112
|
+
let headingPayloads = new Map();
|
|
113
|
+
if (options.heading) {
|
|
114
|
+
const result = headingService.evaluateNodes(graph.getNodes());
|
|
115
|
+
headingPayloads = result.payloadsByUrl;
|
|
116
|
+
}
|
|
117
|
+
// Apply signals to nodes
|
|
118
|
+
for (const node of graph.getNodes()) {
|
|
119
|
+
const pr = prResults.get(node.url);
|
|
120
|
+
if (pr)
|
|
121
|
+
node.pagerankScore = pr.score;
|
|
122
|
+
const hits = hitsResults.get(node.url);
|
|
123
|
+
if (hits) {
|
|
124
|
+
node.authScore = hits.authority_score;
|
|
125
|
+
node.hubScore = hits.hub_score;
|
|
126
|
+
node.linkRole = hits.link_role;
|
|
127
|
+
}
|
|
128
|
+
if (options.orphans) {
|
|
129
|
+
const annotated = annotatedNodes.find((n) => n.url === node.url);
|
|
130
|
+
if (annotated) {
|
|
131
|
+
node.orphanScore = annotated.orphanSeverity;
|
|
132
|
+
node.orphanType = annotated.orphanType;
|
|
133
|
+
node.impactLevel = annotated.impactLevel;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
if (options.heading) {
|
|
137
|
+
const heading = headingPayloads.get(node.url);
|
|
138
|
+
if (heading) {
|
|
139
|
+
node.headingScore = heading.score;
|
|
140
|
+
node.headingData = JSON.stringify(heading);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
if (node.html) {
|
|
144
|
+
const soft404 = soft404Service.analyze(node.html, node.outLinks);
|
|
145
|
+
node.soft404Score = soft404.score;
|
|
146
|
+
const $ = load(node.html);
|
|
147
|
+
const content = analyzeContent($);
|
|
148
|
+
node.wordCount = content.wordCount;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
|
|
152
|
+
// Pre-fetch all page IDs to avoid N+1 queries
|
|
153
|
+
const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
|
|
154
|
+
const urlToId = new Map();
|
|
155
|
+
for (const p of pagesIdentity) {
|
|
156
|
+
urlToId.set(p.normalized_url, p.id);
|
|
157
|
+
}
|
|
158
|
+
const metricsToSave = graph.getNodes().map(node => {
|
|
159
|
+
const pageId = urlToId.get(node.url);
|
|
160
|
+
if (!pageId)
|
|
161
|
+
return null;
|
|
162
|
+
return {
|
|
163
|
+
snapshot_id: snapshotId,
|
|
164
|
+
page_id: pageId,
|
|
165
|
+
crawl_status: node.crawlStatus ?? null,
|
|
166
|
+
word_count: node.wordCount ?? null,
|
|
167
|
+
thin_content_score: node.thinContentScore ?? null,
|
|
168
|
+
external_link_ratio: node.externalLinkRatio ?? null,
|
|
169
|
+
pagerank_score: node.pagerankScore ?? null,
|
|
170
|
+
hub_score: node.hubScore ?? null,
|
|
171
|
+
auth_score: node.authScore ?? null,
|
|
172
|
+
link_role: node.linkRole ?? null,
|
|
173
|
+
duplicate_cluster_id: node.duplicateClusterId ?? null,
|
|
174
|
+
duplicate_type: node.duplicateType ?? null,
|
|
175
|
+
cluster_id: node.clusterId ?? null,
|
|
176
|
+
soft404_score: node.soft404Score ?? null,
|
|
177
|
+
heading_score: node.headingScore ?? null,
|
|
178
|
+
orphan_score: node.orphanScore ?? null,
|
|
179
|
+
orphan_type: node.orphanType ?? null,
|
|
180
|
+
impact_level: node.impactLevel ?? null,
|
|
181
|
+
heading_data: node.headingData ?? null,
|
|
182
|
+
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
183
|
+
};
|
|
184
|
+
}).filter(m => m !== null);
|
|
185
|
+
metricsRepo.insertMany(metricsToSave);
|
|
186
|
+
// Update page-level metadata in transaction
|
|
187
|
+
const tx = db.transaction(() => {
|
|
188
|
+
for (const node of graph.getNodes()) {
|
|
51
189
|
if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
|
|
52
190
|
pageRepo.upsertPage({
|
|
53
191
|
site_id: snapshot.site_id,
|
|
@@ -61,48 +199,40 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false)
|
|
|
61
199
|
});
|
|
62
200
|
}
|
|
63
201
|
}
|
|
64
|
-
// Save duplicate clusters
|
|
65
|
-
if (graph.duplicateClusters.length > 0) {
|
|
66
|
-
const clusterStmt = db.prepare(`
|
|
67
|
-
INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
|
|
68
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
69
|
-
`);
|
|
70
|
-
for (const cluster of graph.duplicateClusters) {
|
|
71
|
-
clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
// Save content clusters
|
|
75
|
-
if (graph.contentClusters.length > 0) {
|
|
76
|
-
const contentStmt = db.prepare(`
|
|
77
|
-
INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
|
|
78
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
79
|
-
`);
|
|
80
|
-
for (const cluster of graph.contentClusters) {
|
|
81
|
-
contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
202
|
});
|
|
85
203
|
tx();
|
|
86
|
-
|
|
204
|
+
emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
|
|
87
205
|
const metrics = calculateMetrics(graph, maxDepth);
|
|
88
|
-
|
|
89
|
-
let
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
206
|
+
// Compute health score if enabled
|
|
207
|
+
let healthScore = null;
|
|
208
|
+
if (options.health) {
|
|
209
|
+
try {
|
|
210
|
+
const rootOrigin = options.rootOrigin ?? '';
|
|
211
|
+
const healthService = new HealthService();
|
|
212
|
+
const issues = healthService.collectCrawlIssues(graph, metrics, rootOrigin);
|
|
213
|
+
const breakdown = healthService.calculateHealthScore(metrics.totalPages, issues);
|
|
214
|
+
healthScore = breakdown.score;
|
|
215
|
+
}
|
|
216
|
+
catch (e) {
|
|
217
|
+
emit({ type: 'error', message: 'Error computing health score', error: e });
|
|
218
|
+
}
|
|
96
219
|
}
|
|
97
|
-
const
|
|
98
|
-
const
|
|
220
|
+
const thinContentCount = graph.getNodes().filter(n => n.wordCount !== undefined && n.wordCount < 200 && n.status === 200).length;
|
|
221
|
+
const orphanCount = metrics.orphanPages.length;
|
|
99
222
|
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
100
223
|
node_count: metrics.totalPages,
|
|
101
224
|
edge_count: metrics.totalEdges,
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
225
|
+
limit_reached: limitReached ? 1 : 0,
|
|
226
|
+
thin_content_count: thinContentCount,
|
|
227
|
+
orphan_count: orphanCount,
|
|
228
|
+
...(healthScore !== null ? { health_score: healthScore } : {})
|
|
106
229
|
});
|
|
107
|
-
|
|
230
|
+
emit({ type: 'metrics:complete', durationMs: 0 });
|
|
231
|
+
return {
|
|
232
|
+
metrics,
|
|
233
|
+
healthData: healthScore !== null ? {
|
|
234
|
+
health: new HealthService().calculateHealthScore(metrics.totalPages, new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')),
|
|
235
|
+
issues: new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')
|
|
236
|
+
} : undefined
|
|
237
|
+
};
|
|
108
238
|
}
|
|
@@ -3,5 +3,46 @@
|
|
|
3
3
|
*/
|
|
4
4
|
export interface NormalizeOptions {
|
|
5
5
|
stripQuery?: boolean;
|
|
6
|
+
toPath?: boolean;
|
|
6
7
|
}
|
|
7
8
|
export declare function normalizeUrl(input: string, base: string, options?: NormalizeOptions): string | null;
|
|
9
|
+
/**
|
|
10
|
+
* Utility for converting between absolute URLs and relative paths
|
|
11
|
+
* primarily used for database storage.
|
|
12
|
+
*/
|
|
13
|
+
export declare class UrlUtil {
|
|
14
|
+
/**
|
|
15
|
+
* Extract a stable domain key from a URL/domain input.
|
|
16
|
+
* Examples:
|
|
17
|
+
* - "https://www.example.com/a" -> "example.com"
|
|
18
|
+
* - "example.com" -> "example.com"
|
|
19
|
+
*/
|
|
20
|
+
static extractDomain(input: string): string;
|
|
21
|
+
/**
|
|
22
|
+
* Resolve a site's absolute origin from persisted site fields.
|
|
23
|
+
*/
|
|
24
|
+
static resolveSiteOrigin(site: {
|
|
25
|
+
domain: string;
|
|
26
|
+
preferred_url?: string | null;
|
|
27
|
+
ssl?: number | null;
|
|
28
|
+
}): string;
|
|
29
|
+
/**
|
|
30
|
+
* Converts a full URL to a root-relative path if it matches the origin.
|
|
31
|
+
* If it doesn't match the origin, it's considered external and kept absolute.
|
|
32
|
+
*/
|
|
33
|
+
static toPath(urlStr: string, origin: string): string;
|
|
34
|
+
/**
|
|
35
|
+
* Converts a root-relative path back to an absolute URL relative to the origin.
|
|
36
|
+
* If the input is already an absolute URL, it is returned as-is.
|
|
37
|
+
*/
|
|
38
|
+
static toAbsolute(pathOrUrl: string, origin: string): string;
|
|
39
|
+
/**
|
|
40
|
+
* Determines if a URL (or path) is internal relative to the origin.
|
|
41
|
+
*/
|
|
42
|
+
static isInternal(pathOrUrl: string, origin: string): boolean;
|
|
43
|
+
/**
|
|
44
|
+
* Build normalized lookup candidates for querying pages table.
|
|
45
|
+
* Returns path/absolute/original variants in priority order, deduplicated.
|
|
46
|
+
*/
|
|
47
|
+
static toLookupCandidates(input: string, origin: string): string[];
|
|
48
|
+
}
|
|
@@ -10,7 +10,7 @@ const TRACKING_PARAMS = new Set([
|
|
|
10
10
|
]);
|
|
11
11
|
const SKIP_EXTENSIONS = new Set([
|
|
12
12
|
'.pdf', '.jpg', '.png', '.svg', '.webp', '.gif',
|
|
13
|
-
'.zip', '.xml', '.json', '.mp4'
|
|
13
|
+
'.zip', '.xml', '.json', '.mp4', '.avif', '.ics'
|
|
14
14
|
]);
|
|
15
15
|
export function normalizeUrl(input, base, options = {}) {
|
|
16
16
|
try {
|
|
@@ -71,6 +71,7 @@ export function normalizeUrl(input, base, options = {}) {
|
|
|
71
71
|
pathname = pathname.slice(0, -1);
|
|
72
72
|
}
|
|
73
73
|
u.pathname = pathname;
|
|
74
|
+
const finalUrl = u.toString();
|
|
74
75
|
// 9. Skip non-HTML assets by extension
|
|
75
76
|
const lastDotIndex = u.pathname.lastIndexOf('.');
|
|
76
77
|
if (lastDotIndex !== -1) {
|
|
@@ -79,10 +80,125 @@ export function normalizeUrl(input, base, options = {}) {
|
|
|
79
80
|
return null;
|
|
80
81
|
}
|
|
81
82
|
}
|
|
82
|
-
// 10. Return
|
|
83
|
-
|
|
83
|
+
// 10. Return path if requested
|
|
84
|
+
if (options.toPath) {
|
|
85
|
+
return u.pathname + u.search;
|
|
86
|
+
}
|
|
87
|
+
// 11. Return final string
|
|
88
|
+
return finalUrl;
|
|
84
89
|
}
|
|
85
90
|
catch (_e) {
|
|
86
91
|
return null;
|
|
87
92
|
}
|
|
88
93
|
}
|
|
94
|
+
/**
 * Utility for converting between absolute URLs and root-relative paths,
 * primarily used for database storage.
 */
export class UrlUtil {
    /**
     * Extract a stable domain key from a URL/domain input.
     *
     * The key is the lower-cased hostname with a leading "www." removed.
     * Examples:
     * - "https://www.example.com/a" -> "example.com"
     * - "example.com" -> "example.com"
     * - "localhost:3000" -> "localhost"
     *
     * @param input - A URL or a bare domain, optionally with port and path.
     * @returns The domain key, or '' when the input is blank.
     */
    static extractDomain(input) {
        const trimmed = input.trim();
        if (!trimmed)
            return '';
        const toKey = (host) => host.toLowerCase().replace(/^www\./, '');
        try {
            const direct = new URL(trimmed);
            // Scheme-less "host:port" input (e.g. "localhost:3000") parses
            // successfully as a URL whose *scheme* is the host name and whose
            // hostname is empty. Only trust the parse when it found a host;
            // otherwise retry below with an explicit protocol.
            if (direct.hostname)
                return toKey(direct.hostname);
        }
        catch {
            // Not parseable as-is; retry with an explicit protocol.
        }
        try {
            return toKey(new URL(`https://${trimmed}`).hostname);
        }
        catch {
            // Still unparseable: fall back to normalizing the raw text.
            return toKey(trimmed);
        }
    }
    /**
     * Resolve a site's absolute origin from persisted site fields.
     *
     * @param site - Object with `domain` and optional `preferred_url` / `ssl`.
     * @returns The origin of `preferred_url` when it is set and parseable;
     *   otherwise `<protocol>://<domain>`, where the protocol is "http" only
     *   when `ssl` is exactly 0 and "https" in every other case.
     */
    static resolveSiteOrigin(site) {
        if (site.preferred_url) {
            try {
                return new URL(site.preferred_url).origin;
            }
            catch {
                // Unparseable preferred_url: use the domain + ssl fallback.
            }
        }
        const protocol = site.ssl === 0 ? 'http' : 'https';
        return `${protocol}://${site.domain}`;
    }
    /**
     * Converts a full URL to a root-relative path if it matches the origin.
     * If it doesn't match the origin, it's considered external and kept
     * absolute. Input that cannot be parsed as an absolute URL is returned
     * unchanged.
     *
     * @param urlStr - The URL to convert.
     * @param origin - The origin to compare against.
     * @returns `pathname + search` for same-origin URLs, else `urlStr`.
     */
    static toPath(urlStr, origin) {
        try {
            const url = new URL(urlStr);
            const originUrl = new URL(origin);
            if (url.origin === originUrl.origin) {
                return url.pathname + url.search;
            }
            return urlStr;
        }
        catch {
            return urlStr;
        }
    }
    /**
     * Converts a root-relative path back to an absolute URL relative to the
     * origin. If the input already starts with "http://" or "https://", it is
     * returned as-is.
     *
     * @param pathOrUrl - A path or URL.
     * @param origin - Base used to resolve a path.
     * @returns The absolute URL, or `pathOrUrl` unchanged when resolution
     *   fails.
     */
    static toAbsolute(pathOrUrl, origin) {
        if (pathOrUrl.startsWith('http://') || pathOrUrl.startsWith('https://')) {
            return pathOrUrl;
        }
        try {
            return new URL(pathOrUrl, origin).toString();
        }
        catch {
            return pathOrUrl;
        }
    }
    /**
     * Determines if a URL (or path) is internal relative to the origin.
     *
     * Plain paths are always internal. Inputs that start with "http" or with
     * "//" (protocol-relative, which can point at another host) are resolved
     * against `origin` and compared by origin.
     *
     * @param pathOrUrl - A path or URL to classify.
     * @param origin - The origin that defines "internal".
     * @returns `true` when the input stays on `origin`; `false` when it points
     *   elsewhere or cannot be resolved.
     */
    static isInternal(pathOrUrl, origin) {
        const mayLeaveOrigin = pathOrUrl.startsWith('http') || pathOrUrl.startsWith('//');
        if (!mayLeaveOrigin)
            return true;
        try {
            // Resolve against the origin so that protocol-relative input is
            // classified by the host it actually points to.
            const url = new URL(pathOrUrl, origin);
            const originUrl = new URL(origin);
            return url.origin === originUrl.origin;
        }
        catch {
            return false;
        }
    }
    /**
     * Build normalized lookup candidates for querying pages table.
     * Returns path/absolute/original variants in priority order, deduplicated.
     *
     * Each variant is taken from `normalizeUrl` when that yields a truthy
     * value, and from the matching `UrlUtil` converter otherwise.
     *
     * @param input - The path or URL being looked up.
     * @param origin - The origin of the site being queried.
     * @returns Distinct non-empty candidates; `[]` for blank input.
     */
    static toLookupCandidates(input, origin) {
        const candidates = new Set();
        const raw = input.trim();
        if (!raw)
            return [];
        const absolute = normalizeUrl(raw, origin, { stripQuery: false }) || UrlUtil.toAbsolute(raw, origin);
        const path = normalizeUrl(raw, origin, { stripQuery: false, toPath: true }) || UrlUtil.toPath(raw, origin);
        const absolutePath = normalizeUrl(absolute, '', { stripQuery: false, toPath: true }) || UrlUtil.toPath(absolute, origin);
        // Insertion order is the priority order; the Set drops duplicates.
        candidates.add(path);
        candidates.add(absolute);
        candidates.add(absolutePath);
        candidates.add(raw);
        return Array.from(candidates).filter(Boolean);
    }
}
|
package/dist/crawler/parser.d.ts
CHANGED
|
@@ -11,12 +11,10 @@ export interface ParseResult {
|
|
|
11
11
|
contentHash: string;
|
|
12
12
|
simhash?: string;
|
|
13
13
|
uniqueTokenRatio?: number;
|
|
14
|
-
soft404Score: number;
|
|
15
|
-
soft404Signals: string[];
|
|
16
14
|
}
|
|
17
15
|
export declare class Parser {
|
|
18
16
|
/**
|
|
19
17
|
* Parses HTML content to extract metadata and links.
|
|
20
18
|
*/
|
|
21
|
-
parse(html: string, baseUrl: string,
|
|
19
|
+
parse(html: string, baseUrl: string, _status: number): ParseResult;
|
|
22
20
|
}
|