@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/events.d.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
export type CrawlEvent = {
|
|
2
|
+
type: 'crawl:start';
|
|
3
|
+
url: string;
|
|
4
|
+
} | {
|
|
5
|
+
type: 'crawl:success';
|
|
6
|
+
url: string;
|
|
7
|
+
status: number;
|
|
8
|
+
durationMs: number;
|
|
9
|
+
depth?: number;
|
|
10
|
+
} | {
|
|
11
|
+
type: 'crawl:error';
|
|
12
|
+
url: string;
|
|
13
|
+
error: string;
|
|
14
|
+
depth?: number;
|
|
15
|
+
} | {
|
|
16
|
+
type: 'crawl:limit-reached';
|
|
17
|
+
limit: number;
|
|
18
|
+
} | {
|
|
19
|
+
type: 'crawl:progress';
|
|
20
|
+
pagesCrawled: number;
|
|
21
|
+
queued: number;
|
|
22
|
+
active: number;
|
|
23
|
+
nodesFound: number;
|
|
24
|
+
edgesFound: number;
|
|
25
|
+
phase?: string;
|
|
26
|
+
} | {
|
|
27
|
+
type: 'queue:enqueue';
|
|
28
|
+
url: string;
|
|
29
|
+
depth: number;
|
|
30
|
+
} | {
|
|
31
|
+
type: 'metrics:start';
|
|
32
|
+
phase: string;
|
|
33
|
+
} | {
|
|
34
|
+
type: 'metrics:complete';
|
|
35
|
+
durationMs: number;
|
|
36
|
+
} | {
|
|
37
|
+
type: 'debug';
|
|
38
|
+
message: string;
|
|
39
|
+
context?: unknown;
|
|
40
|
+
} | {
|
|
41
|
+
type: 'info';
|
|
42
|
+
message: string;
|
|
43
|
+
context?: unknown;
|
|
44
|
+
} | {
|
|
45
|
+
type: 'warn';
|
|
46
|
+
message: string;
|
|
47
|
+
context?: unknown;
|
|
48
|
+
} | {
|
|
49
|
+
type: 'error';
|
|
50
|
+
message: string;
|
|
51
|
+
error?: unknown;
|
|
52
|
+
context?: unknown;
|
|
53
|
+
};
|
|
54
|
+
export interface EngineContext {
|
|
55
|
+
emit: (event: CrawlEvent) => void;
|
|
56
|
+
}
|
package/dist/events.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/graph/graph.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export interface GraphNode {
|
|
2
2
|
url: string;
|
|
3
|
+
isInternal?: boolean;
|
|
3
4
|
depth: number;
|
|
4
5
|
inLinks: number;
|
|
5
6
|
outLinks: number;
|
|
@@ -9,45 +10,48 @@ export interface GraphNode {
|
|
|
9
10
|
nofollow?: boolean;
|
|
10
11
|
brokenLinks?: string[];
|
|
11
12
|
redirectChain?: string[];
|
|
13
|
+
discoveredViaSitemap?: boolean;
|
|
12
14
|
incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
|
|
13
15
|
etag?: string;
|
|
14
16
|
lastModified?: string;
|
|
15
17
|
contentHash?: string;
|
|
16
18
|
html?: string;
|
|
17
|
-
pageRank?: number;
|
|
18
|
-
pageRankScore?: number;
|
|
19
|
-
authorityScore?: number;
|
|
20
|
-
hubScore?: number;
|
|
21
|
-
duplicateClusterId?: string;
|
|
22
|
-
duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
|
|
23
|
-
isClusterPrimary?: boolean;
|
|
24
|
-
isCollapsed?: boolean;
|
|
25
|
-
collapseInto?: string;
|
|
26
19
|
simhash?: string;
|
|
27
20
|
uniqueTokenRatio?: number;
|
|
28
|
-
soft404Score?: number;
|
|
29
|
-
soft404Signals?: string[];
|
|
30
21
|
crawlTrapFlag?: boolean;
|
|
31
22
|
crawlTrapRisk?: number;
|
|
32
23
|
trapType?: string;
|
|
33
24
|
securityError?: string;
|
|
34
25
|
retries?: number;
|
|
35
|
-
clusterId?: number;
|
|
36
26
|
bytesReceived?: number;
|
|
37
|
-
|
|
27
|
+
crawlStatus?: string;
|
|
28
|
+
wordCount?: number;
|
|
29
|
+
thinContentScore?: number;
|
|
30
|
+
externalLinkRatio?: number;
|
|
31
|
+
h1Count?: number;
|
|
32
|
+
h2Count?: number;
|
|
33
|
+
title?: string;
|
|
34
|
+
clusterId?: number;
|
|
35
|
+
duplicateClusterId?: string;
|
|
36
|
+
duplicateType?: 'exact' | 'near' | 'template_heavy';
|
|
37
|
+
pagerankScore?: number;
|
|
38
|
+
hubScore?: number;
|
|
39
|
+
authScore?: number;
|
|
40
|
+
linkRole?: string;
|
|
41
|
+
soft404Score?: number;
|
|
42
|
+
headingScore?: number;
|
|
43
|
+
orphanScore?: number;
|
|
44
|
+
orphanType?: string;
|
|
45
|
+
impactLevel?: string;
|
|
46
|
+
headingData?: string;
|
|
47
|
+
isClusterPrimary?: boolean;
|
|
48
|
+
isCollapsed?: boolean;
|
|
38
49
|
}
|
|
39
50
|
export interface GraphEdge {
|
|
40
51
|
source: string;
|
|
41
52
|
target: string;
|
|
42
53
|
weight: number;
|
|
43
54
|
}
|
|
44
|
-
export interface ClusterInfo {
|
|
45
|
-
id: number;
|
|
46
|
-
count: number;
|
|
47
|
-
primaryUrl: string;
|
|
48
|
-
risk: 'low' | 'medium' | 'high';
|
|
49
|
-
sharedPathPrefix?: string;
|
|
50
|
-
}
|
|
51
55
|
export interface CrawlStats {
|
|
52
56
|
pagesFetched: number;
|
|
53
57
|
pagesCached: number;
|
|
@@ -59,25 +63,23 @@ export declare class Graph {
|
|
|
59
63
|
edges: Map<string, number>;
|
|
60
64
|
limitReached: boolean;
|
|
61
65
|
sessionStats: CrawlStats;
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
}[];
|
|
74
|
-
contentClusters: ClusterInfo[];
|
|
66
|
+
/**
|
|
67
|
+
* Generates a unique key for an edge.
|
|
68
|
+
*/
|
|
69
|
+
static getEdgeKey(source: string, target: string): string;
|
|
70
|
+
/**
|
|
71
|
+
* Parses an edge key back into source and target.
|
|
72
|
+
*/
|
|
73
|
+
static parseEdgeKey(key: string): {
|
|
74
|
+
source: string;
|
|
75
|
+
target: string;
|
|
76
|
+
};
|
|
75
77
|
/**
|
|
76
78
|
* Adds a node to the graph if it doesn't exist.
|
|
77
79
|
* If it exists, updates the status if the new status is non-zero (meaning we crawled it).
|
|
78
80
|
* Depth is only set on creation (BFS guarantees shortest path first).
|
|
79
81
|
*/
|
|
80
|
-
addNode(url: string, depth: number, status?: number): void;
|
|
82
|
+
addNode(url: string, depth: number, status?: number, isInternal?: boolean): void;
|
|
81
83
|
updateNodeData(url: string, data: Partial<GraphNode>): void;
|
|
82
84
|
/**
|
|
83
85
|
* Adds a directed edge between two nodes.
|
|
@@ -90,14 +92,6 @@ export declare class Graph {
|
|
|
90
92
|
toJSON(): {
|
|
91
93
|
nodes: GraphNode[];
|
|
92
94
|
edges: GraphEdge[];
|
|
93
|
-
duplicateClusters: {
|
|
94
|
-
id: string;
|
|
95
|
-
type: "exact" | "near" | "template_heavy";
|
|
96
|
-
size: number;
|
|
97
|
-
representative: string;
|
|
98
|
-
severity: "low" | "medium" | "high";
|
|
99
|
-
}[];
|
|
100
|
-
contentClusters: ClusterInfo[];
|
|
101
95
|
};
|
|
102
96
|
static fromJSON(json: any): Graph;
|
|
103
97
|
}
|
package/dist/graph/graph.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export class Graph {
|
|
2
2
|
nodes = new Map();
|
|
3
|
-
// Using string
|
|
3
|
+
// Using JSON string of [source, target] to ensure uniqueness. Mapping to weight.
|
|
4
4
|
edges = new Map();
|
|
5
5
|
limitReached = false;
|
|
6
6
|
sessionStats = {
|
|
@@ -9,19 +9,33 @@ export class Graph {
|
|
|
9
9
|
pagesSkipped: 0,
|
|
10
10
|
totalFound: 0
|
|
11
11
|
};
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
/**
|
|
13
|
+
* Generates a unique key for an edge.
|
|
14
|
+
*/
|
|
15
|
+
static getEdgeKey(source, target) {
|
|
16
|
+
return source + '\x00' + target;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Parses an edge key back into source and target.
|
|
20
|
+
*/
|
|
21
|
+
static parseEdgeKey(key) {
|
|
22
|
+
const splitIndex = key.indexOf('\x00');
|
|
23
|
+
return {
|
|
24
|
+
source: key.slice(0, splitIndex),
|
|
25
|
+
target: key.slice(splitIndex + 1)
|
|
26
|
+
};
|
|
27
|
+
}
|
|
15
28
|
/**
|
|
16
29
|
* Adds a node to the graph if it doesn't exist.
|
|
17
30
|
* If it exists, updates the status if the new status is non-zero (meaning we crawled it).
|
|
18
31
|
* Depth is only set on creation (BFS guarantees shortest path first).
|
|
19
32
|
*/
|
|
20
|
-
addNode(url, depth, status = 0) {
|
|
33
|
+
addNode(url, depth, status = 0, isInternal = true) {
|
|
21
34
|
const existing = this.nodes.get(url);
|
|
22
35
|
if (!existing) {
|
|
23
36
|
this.nodes.set(url, {
|
|
24
37
|
url,
|
|
38
|
+
isInternal,
|
|
25
39
|
depth,
|
|
26
40
|
status,
|
|
27
41
|
inLinks: 0,
|
|
@@ -33,6 +47,9 @@ export class Graph {
|
|
|
33
47
|
if (status !== 0) {
|
|
34
48
|
existing.status = status;
|
|
35
49
|
}
|
|
50
|
+
if (isInternal !== undefined) {
|
|
51
|
+
existing.isInternal = isInternal;
|
|
52
|
+
}
|
|
36
53
|
}
|
|
37
54
|
}
|
|
38
55
|
updateNodeData(url, data) {
|
|
@@ -50,7 +67,7 @@ export class Graph {
|
|
|
50
67
|
const sourceNode = this.nodes.get(source);
|
|
51
68
|
const targetNode = this.nodes.get(target);
|
|
52
69
|
if (sourceNode && targetNode) {
|
|
53
|
-
const edgeKey =
|
|
70
|
+
const edgeKey = Graph.getEdgeKey(source, target);
|
|
54
71
|
if (!this.edges.has(edgeKey)) {
|
|
55
72
|
this.edges.set(edgeKey, weight);
|
|
56
73
|
sourceNode.outLinks++;
|
|
@@ -70,16 +87,14 @@ export class Graph {
|
|
|
70
87
|
}
|
|
71
88
|
getEdges() {
|
|
72
89
|
return Array.from(this.edges.entries()).map(([edge, weight]) => {
|
|
73
|
-
const
|
|
90
|
+
const { source, target } = Graph.parseEdgeKey(edge);
|
|
74
91
|
return { source, target, weight };
|
|
75
92
|
});
|
|
76
93
|
}
|
|
77
94
|
toJSON() {
|
|
78
95
|
return {
|
|
79
96
|
nodes: this.getNodes(),
|
|
80
|
-
edges: this.getEdges()
|
|
81
|
-
duplicateClusters: this.duplicateClusters,
|
|
82
|
-
contentClusters: this.contentClusters
|
|
97
|
+
edges: this.getEdges()
|
|
83
98
|
};
|
|
84
99
|
}
|
|
85
100
|
static fromJSON(json) {
|
|
@@ -91,16 +106,10 @@ export class Graph {
|
|
|
91
106
|
}
|
|
92
107
|
if (json.edges) {
|
|
93
108
|
for (const edge of json.edges) {
|
|
94
|
-
const key =
|
|
109
|
+
const key = Graph.getEdgeKey(edge.source, edge.target);
|
|
95
110
|
graph.edges.set(key, edge.weight || 1.0);
|
|
96
111
|
}
|
|
97
112
|
}
|
|
98
|
-
if (json.duplicateClusters) {
|
|
99
|
-
graph.duplicateClusters = json.duplicateClusters;
|
|
100
|
-
}
|
|
101
|
-
if (json.contentClusters) {
|
|
102
|
-
graph.contentClusters = json.contentClusters;
|
|
103
|
-
}
|
|
104
113
|
return graph;
|
|
105
114
|
}
|
|
106
115
|
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { Graph } from './graph.js';
|
|
2
|
+
export type LinkRole = 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
|
|
3
|
+
export interface HITSRow {
|
|
4
|
+
authority_score: number;
|
|
5
|
+
hub_score: number;
|
|
6
|
+
link_role: LinkRole;
|
|
7
|
+
}
|
|
8
|
+
export interface HITSOptions {
|
|
9
|
+
iterations?: number;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Service to compute Hub and Authority scores using the HITS algorithm.
|
|
13
|
+
* Operates purely on the internal link graph.
|
|
14
|
+
*/
|
|
15
|
+
export declare class HITSService {
|
|
16
|
+
/**
|
|
17
|
+
* Computes Hub and Authority scores using the HITS algorithm.
|
|
18
|
+
* @param {Graph} graph - The link graph to analyze.
|
|
19
|
+
* @param {HITSOptions} options - Algorithm options (e.g. number of iterations).
|
|
20
|
+
* @returns {Map<string, HITSRow>} A map of page URLs to their HITS results.
|
|
21
|
+
*/
|
|
22
|
+
evaluate(graph: Graph, options?: HITSOptions): Map<string, HITSRow>;
|
|
23
|
+
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Service to compute Hub and Authority scores using the HITS algorithm.
|
|
3
|
+
* Operates purely on the internal link graph.
|
|
4
|
+
*/
|
|
5
|
+
export class HITSService {
|
|
6
|
+
/**
|
|
7
|
+
* Computes Hub and Authority scores using the HITS algorithm.
|
|
8
|
+
* @param {Graph} graph - The link graph to analyze.
|
|
9
|
+
* @param {HITSOptions} options - Algorithm options (e.g. number of iterations).
|
|
10
|
+
* @returns {Map<string, HITSRow>} A map of page URLs to their HITS results.
|
|
11
|
+
*/
|
|
12
|
+
evaluate(graph, options = {}) {
|
|
13
|
+
const iterations = options.iterations || 20;
|
|
14
|
+
const nodes = graph.getNodes();
|
|
15
|
+
// 1. Filter eligible nodes
|
|
16
|
+
const eligibleNodes = nodes.filter(n => (n.status === 200 || n.status === 0) &&
|
|
17
|
+
(!n.redirectChain || n.redirectChain.length === 0) &&
|
|
18
|
+
!n.noindex);
|
|
19
|
+
const N = eligibleNodes.length;
|
|
20
|
+
const results = new Map();
|
|
21
|
+
if (N === 0)
|
|
22
|
+
return results;
|
|
23
|
+
// Map URL to Index for O(1) access
|
|
24
|
+
const urlToIndex = new Map();
|
|
25
|
+
for (let i = 0; i < N; i++) {
|
|
26
|
+
urlToIndex.set(eligibleNodes[i].url, i);
|
|
27
|
+
}
|
|
28
|
+
// Build Adjacency Lists
|
|
29
|
+
const incoming = new Array(N).fill(null).map(() => []);
|
|
30
|
+
const outgoing = new Array(N).fill(null).map(() => []);
|
|
31
|
+
const allEdges = graph.getEdges();
|
|
32
|
+
for (const edge of allEdges) {
|
|
33
|
+
if (edge.source === edge.target)
|
|
34
|
+
continue;
|
|
35
|
+
const sourceIndex = urlToIndex.get(edge.source);
|
|
36
|
+
const targetIndex = urlToIndex.get(edge.target);
|
|
37
|
+
if (sourceIndex !== undefined && targetIndex !== undefined) {
|
|
38
|
+
const weight = edge.weight || 1.0;
|
|
39
|
+
incoming[targetIndex].push({ sourceIndex, weight });
|
|
40
|
+
outgoing[sourceIndex].push({ targetIndex, weight });
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
// Initialize Scores
|
|
44
|
+
const authScores = new Float64Array(N).fill(1.0);
|
|
45
|
+
const hubScores = new Float64Array(N).fill(1.0);
|
|
46
|
+
// 2. Iteration
|
|
47
|
+
for (let iter = 0; iter < iterations; iter++) {
|
|
48
|
+
let normAuth = 0;
|
|
49
|
+
for (let i = 0; i < N; i++) {
|
|
50
|
+
const inLinks = incoming[i];
|
|
51
|
+
let newAuth = 0;
|
|
52
|
+
for (let j = 0; j < inLinks.length; j++) {
|
|
53
|
+
const link = inLinks[j];
|
|
54
|
+
newAuth += hubScores[link.sourceIndex] * link.weight;
|
|
55
|
+
}
|
|
56
|
+
authScores[i] = newAuth;
|
|
57
|
+
normAuth += newAuth * newAuth;
|
|
58
|
+
}
|
|
59
|
+
normAuth = Math.sqrt(normAuth);
|
|
60
|
+
if (normAuth > 0) {
|
|
61
|
+
for (let i = 0; i < N; i++)
|
|
62
|
+
authScores[i] /= normAuth;
|
|
63
|
+
}
|
|
64
|
+
let normHub = 0;
|
|
65
|
+
for (let i = 0; i < N; i++) {
|
|
66
|
+
const outLinks = outgoing[i];
|
|
67
|
+
let newHub = 0;
|
|
68
|
+
for (let j = 0; j < outLinks.length; j++) {
|
|
69
|
+
const link = outLinks[j];
|
|
70
|
+
newHub += authScores[link.targetIndex] * link.weight;
|
|
71
|
+
}
|
|
72
|
+
hubScores[i] = newHub;
|
|
73
|
+
normHub += newHub * newHub;
|
|
74
|
+
}
|
|
75
|
+
normHub = Math.sqrt(normHub);
|
|
76
|
+
if (normHub > 0) {
|
|
77
|
+
for (let i = 0; i < N; i++)
|
|
78
|
+
hubScores[i] /= normHub;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
// 3. Classification and Result Mapping
|
|
82
|
+
const sortedAuth = [...authScores].sort((a, b) => a - b);
|
|
83
|
+
const sortedHub = [...hubScores].sort((a, b) => a - b);
|
|
84
|
+
const medianAuth = sortedAuth[Math.floor(sortedAuth.length / 2)];
|
|
85
|
+
const medianHub = sortedHub[Math.floor(sortedHub.length / 2)];
|
|
86
|
+
const maxAuth = sortedAuth[sortedAuth.length - 1];
|
|
87
|
+
const maxHub = sortedHub[sortedHub.length - 1];
|
|
88
|
+
for (let i = 0; i < N; i++) {
|
|
89
|
+
const auth = authScores[i];
|
|
90
|
+
const hub = hubScores[i];
|
|
91
|
+
const url = eligibleNodes[i].url;
|
|
92
|
+
const isHighAuth = (auth > medianAuth || (auth === maxAuth && auth > 0)) && auth > 0.00001;
|
|
93
|
+
const isHighHub = (hub > medianHub || (hub === maxHub && hub > 0)) && hub > 0.00001;
|
|
94
|
+
let link_role = 'peripheral';
|
|
95
|
+
if (isHighAuth && isHighHub)
|
|
96
|
+
link_role = 'power';
|
|
97
|
+
else if (isHighAuth)
|
|
98
|
+
link_role = 'authority';
|
|
99
|
+
else if (isHighHub)
|
|
100
|
+
link_role = 'hub';
|
|
101
|
+
else if (auth > 0.00001 && hub > 0.00001)
|
|
102
|
+
link_role = 'balanced';
|
|
103
|
+
results.set(url, {
|
|
104
|
+
authority_score: auth,
|
|
105
|
+
hub_score: hub,
|
|
106
|
+
link_role
|
|
107
|
+
});
|
|
108
|
+
}
|
|
109
|
+
return results;
|
|
110
|
+
}
|
|
111
|
+
}
|
package/dist/graph/metrics.d.ts
CHANGED
package/dist/graph/metrics.js
CHANGED
|
@@ -3,6 +3,28 @@ export function calculateMetrics(graph, _maxDepth) {
|
|
|
3
3
|
const edges = graph.getEdges();
|
|
4
4
|
const totalPages = nodes.length;
|
|
5
5
|
const totalEdges = edges.length;
|
|
6
|
+
// Identify broken nodes
|
|
7
|
+
const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));
|
|
8
|
+
// Pre-compute outgoing edges per node for faster lookup
|
|
9
|
+
const outgoingEdges = new Map();
|
|
10
|
+
for (const edge of edges) {
|
|
11
|
+
let targets = outgoingEdges.get(edge.source);
|
|
12
|
+
if (!targets) {
|
|
13
|
+
targets = [];
|
|
14
|
+
outgoingEdges.set(edge.source, targets);
|
|
15
|
+
}
|
|
16
|
+
targets.push(edge.target);
|
|
17
|
+
}
|
|
18
|
+
// Populate brokenLinks per node
|
|
19
|
+
for (const node of nodes) {
|
|
20
|
+
const targets = outgoingEdges.get(node.url);
|
|
21
|
+
if (targets) {
|
|
22
|
+
const broken = targets.filter(targetUrl => brokenNodes.has(targetUrl));
|
|
23
|
+
if (broken.length > 0) {
|
|
24
|
+
node.brokenLinks = broken;
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
}
|
|
6
28
|
// Authority Score (per node)
|
|
7
29
|
const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
|
|
8
30
|
const getAuthority = (node) => {
|
|
@@ -43,16 +65,11 @@ export function calculateMetrics(graph, _maxDepth) {
|
|
|
43
65
|
}
|
|
44
66
|
}
|
|
45
67
|
// topAuthorityPages: Top 10 by authority
|
|
46
|
-
const topAuthorityPages =
|
|
47
|
-
.
|
|
68
|
+
const topAuthorityPages = nodes
|
|
69
|
+
.filter(n => n.isInternal !== false && n.status > 0)
|
|
70
|
+
.map(n => ({ url: n.url, authority: getAuthority(n) }))
|
|
48
71
|
.sort((a, b) => b.authority - a.authority)
|
|
49
72
|
.slice(0, 10);
|
|
50
|
-
// topPageRankPages: Top 10 by raw PageRank
|
|
51
|
-
const topPageRankPages = [...nodes]
|
|
52
|
-
.filter(n => n.pageRank !== undefined)
|
|
53
|
-
.map(n => ({ url: n.url, score: n.pageRank }))
|
|
54
|
-
.sort((a, b) => b.score - a.score)
|
|
55
|
-
.slice(0, 10);
|
|
56
73
|
const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
|
|
57
74
|
const maxDepthFound = nodes.reduce((max, n) => Math.max(max, n.depth), 0);
|
|
58
75
|
return {
|
|
@@ -67,7 +84,6 @@ export function calculateMetrics(graph, _maxDepth) {
|
|
|
67
84
|
crawlEfficiencyScore,
|
|
68
85
|
averageDepth,
|
|
69
86
|
structuralEntropy,
|
|
70
|
-
topPageRankPages,
|
|
71
87
|
limitReached: graph.limitReached,
|
|
72
88
|
sessionStats: graph.sessionStats
|
|
73
89
|
};
|
package/dist/graph/pagerank.d.ts
CHANGED
|
@@ -1,12 +1,25 @@
|
|
|
1
1
|
import { Graph } from './graph.js';
|
|
2
|
-
interface
|
|
2
|
+
export interface PageRankRow {
|
|
3
|
+
raw_rank: number;
|
|
4
|
+
score: number;
|
|
5
|
+
}
|
|
6
|
+
export interface PageRankOptions {
|
|
3
7
|
dampingFactor?: number;
|
|
4
8
|
maxIterations?: number;
|
|
5
9
|
convergenceThreshold?: number;
|
|
6
10
|
soft404WeightThreshold?: number;
|
|
11
|
+
neutralScoreWhenFlat?: number;
|
|
7
12
|
}
|
|
8
13
|
/**
|
|
9
|
-
*
|
|
14
|
+
* Service to analyze a site's link graph and compute PageRank metrics.
|
|
15
|
+
* Runs only on the full crawl graph.
|
|
10
16
|
*/
|
|
11
|
-
export declare
|
|
12
|
-
|
|
17
|
+
export declare class PageRankService {
|
|
18
|
+
/**
|
|
19
|
+
* Computes a Production-Grade Weighted PageRank over the given graph.
|
|
20
|
+
* @param {Graph} graph - The full site graph structure.
|
|
21
|
+
* @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
|
|
22
|
+
* @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
|
|
23
|
+
*/
|
|
24
|
+
evaluate(graph: Graph, options?: PageRankOptions): Map<string, PageRankRow>;
|
|
25
|
+
}
|