@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/src/graph/graph.ts
DELETED
|
@@ -1,192 +0,0 @@
|
|
|
1
|
-
/**
 * One page (URL) in the crawl graph plus every per-page attribute that the
 * crawl and analysis stages may attach to it.
 *
 * Only `url`, `depth`, `inLinks`, `outLinks` and `status` are required; every
 * other field is optional and is filled in by whichever stage produces it.
 */
export interface GraphNode {
  // ---- Always present ----------------------------------------------------
  /** URL of the page; also the key under which the node is stored. */
  url: string;
  /** Depth recorded when the node was first added to the graph. */
  depth: number;
  /** Number of distinct edges pointing at this node. */
  inLinks: number;
  /** Number of distinct edges leaving this node. */
  outLinks: number;
  /** Status code of the page; `0` is used while no real status is known. */
  status: number;

  // ---- Indexing directives -----------------------------------------------
  canonical?: string;
  noindex?: boolean;
  nofollow?: boolean;

  // ---- Link health / fetch details ---------------------------------------
  /** Targets of outgoing edges whose status is >= 400 or 0. */
  brokenLinks?: string[];
  redirectChain?: string[];

  // ---- Incremental crawling ----------------------------------------------
  incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
  etag?: string;
  lastModified?: string;
  contentHash?: string;
  html?: string;

  // ---- Link-analysis scores ----------------------------------------------
  /** Raw PageRank value. */
  pageRank?: number;
  /** PageRank rescaled to the 0-100 range. */
  pageRankScore?: number;
  authorityScore?: number;
  hubScore?: number;

  // ---- Duplicate detection -----------------------------------------------
  duplicateClusterId?: string;
  duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
  isClusterPrimary?: boolean;
  isCollapsed?: boolean;
  collapseInto?: string;
  simhash?: string;
  uniqueTokenRatio?: number;

  // ---- Soft-404 and crawl-trap signals -----------------------------------
  soft404Score?: number;
  soft404Signals?: string[];
  crawlTrapFlag?: boolean;
  crawlTrapRisk?: number;
  trapType?: string;

  // ---- Miscellaneous crawl bookkeeping and content metrics ---------------
  securityError?: string;
  retries?: number;
  clusterId?: number;
  bytesReceived?: number;
  linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
  crawlStatus?: string;
  wordCount?: number;
  thinContentScore?: number;
  externalLinkRatio?: number;
  orphanScore?: number;
}
|
|
44
|
-
|
|
45
|
-
/** A directed, weighted link from one page URL to another. */
export interface GraphEdge {
  /** URL of the page the link starts from. */
  source: string;
  /** URL of the page the link points to. */
  target: string;
  /** Weight attached to the link. */
  weight: number;
}
|
|
50
|
-
|
|
51
|
-
/** Summary record describing one content cluster of pages. */
export interface ClusterInfo {
  /** Numeric identifier of the cluster. */
  id: number;
  /** Number of pages counted for the cluster. */
  count: number;
  /** URL chosen as the primary page of the cluster. */
  primaryUrl: string;
  /** Risk level assigned to the cluster. */
  risk: 'low' | 'medium' | 'high';
  /** Path prefix shared by the cluster's pages, when one is recorded. */
  sharedPathPrefix?: string;
}
|
|
58
|
-
|
|
59
|
-
/** Per-session crawl counters. */
export interface CrawlStats {
  /** Count of pages fetched. */
  pagesFetched: number;
  /** Count of pages served from cache. */
  pagesCached: number;
  /** Count of pages skipped. */
  pagesSkipped: number;
  /** Total count of pages found. */
  totalFound: number;
}
|
|
65
|
-
|
|
66
|
-
/**
 * In-memory directed link graph of crawled pages.
 *
 * Nodes are keyed by URL. Edges are keyed by the JSON encoding of
 * `[source, target]` (see {@link Graph.getEdgeKey}) and map to a weight, so a
 * given (source, target) pair is stored at most once.
 */
export class Graph {
  /** All known pages, keyed by URL. */
  nodes: Map<string, GraphNode> = new Map();
  /** Edge key (JSON string of `[source, target]`) -> weight. */
  edges: Map<string, number> = new Map();
  limitReached: boolean = false;
  sessionStats: CrawlStats = {
    pagesFetched: 0,
    pagesCached: 0,
    pagesSkipped: 0,
    totalFound: 0
  };
  trapClusters: { pattern: string; type: string; count: number }[] = [];
  duplicateClusters: { id: string; type: 'exact' | 'near' | 'template_heavy'; size: number; representative: string; severity: 'low' | 'medium' | 'high' }[] = [];
  contentClusters: ClusterInfo[] = [];

  /**
   * Generates a unique key for an edge.
   *
   * JSON-encoding the pair keeps the key unambiguous for any URL content.
   */
  static getEdgeKey(source: string, target: string): string {
    return JSON.stringify([source, target]);
  }

  /**
   * Parses an edge key produced by {@link Graph.getEdgeKey} back into its
   * source and target.
   */
  static parseEdgeKey(key: string): { source: string; target: string } {
    const [source, target] = JSON.parse(key);
    return { source, target };
  }

  /**
   * Adds a node to the graph if it doesn't exist.
   * If it exists, only its status is updated, and only when the new status is
   * non-zero (meaning the page has actually been crawled).
   * Depth is only set on creation (BFS guarantees shortest path first).
   *
   * @param url    URL of the page; used as the node key.
   * @param depth  Depth to record when the node is created.
   * @param status Status code; `0` means "not known yet".
   */
  addNode(url: string, depth: number, status: number = 0) {
    const existing = this.nodes.get(url);
    if (!existing) {
      this.nodes.set(url, {
        url,
        depth,
        status,
        inLinks: 0,
        outLinks: 0
      });
    } else if (status !== 0) {
      // Upgrade a pending (0) or stale status to the real one.
      existing.status = status;
    }
  }

  /**
   * Merges `data` into an existing node. Unknown URLs are ignored.
   */
  updateNodeData(url: string, data: Partial<GraphNode>) {
    const existing = this.nodes.get(url);
    if (existing) {
      Object.assign(existing, data);
    }
  }

  /**
   * Adds a directed edge between two nodes.
   * Both nodes must already exist in the graph; otherwise the call is a no-op.
   * The first time a (source, target) pair is seen, the endpoints'
   * `outLinks` / `inLinks` counters are incremented. Repeated calls for the
   * same pair only raise the stored weight when the new weight is higher.
   */
  addEdge(source: string, target: string, weight: number = 1.0) {
    const sourceNode = this.nodes.get(source);
    const targetNode = this.nodes.get(target);
    if (!sourceNode || !targetNode) {
      return;
    }

    const edgeKey = Graph.getEdgeKey(source, target);
    const currentWeight = this.edges.get(edgeKey);
    if (currentWeight === undefined) {
      this.edges.set(edgeKey, weight);
      sourceNode.outLinks++;
      targetNode.inLinks++;
    } else if (weight > currentWeight) {
      // Edge already known: keep the highest weight seen so far.
      this.edges.set(edgeKey, weight);
    }
  }

  /** Returns all nodes as an array (a new array over the same node objects). */
  getNodes(): GraphNode[] {
    return Array.from(this.nodes.values());
  }

  /** Returns all edges, decoded from their keys, in insertion order. */
  getEdges(): GraphEdge[] {
    return Array.from(this.edges.entries()).map(([edge, weight]) => {
      const { source, target } = Graph.parseEdgeKey(edge);
      return { source, target, weight };
    });
  }

  /**
   * Serializes nodes, edges, duplicate clusters and content clusters.
   * `limitReached`, `sessionStats` and `trapClusters` are not included.
   */
  toJSON() {
    return {
      nodes: this.getNodes(),
      edges: this.getEdges(),
      duplicateClusters: this.duplicateClusters,
      contentClusters: this.contentClusters
    };
  }

  /**
   * Rebuilds a graph from the shape produced by {@link Graph.toJSON}.
   *
   * Nodes are shallow-copied as-is (their link counters are taken from the
   * JSON, not recomputed). An edge without a weight defaults to `1.0`; an
   * explicit weight of `0` is preserved so that a `toJSON` / `fromJSON`
   * round trip does not alter stored weights.
   */
  static fromJSON(json: any): Graph {
    const graph = new Graph();
    if (json.nodes) {
      for (const node of json.nodes) {
        graph.nodes.set(node.url, { ...node });
      }
    }
    if (json.edges) {
      for (const edge of json.edges) {
        const key = Graph.getEdgeKey(edge.source, edge.target);
        // `??` (not `||`) so that a legitimate weight of 0 is not replaced.
        graph.edges.set(key, edge.weight ?? 1.0);
      }
    }
    if (json.duplicateClusters) {
      graph.duplicateClusters = json.duplicateClusters;
    }
    if (json.contentClusters) {
      graph.contentClusters = json.contentClusters;
    }
    return graph;
  }
}
|
package/src/graph/metrics.ts
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
import { Graph, GraphNode } from './graph.js';
|
|
2
|
-
|
|
3
|
-
/** Aggregate, graph-wide figures produced by `calculateMetrics`. */
export interface Metrics {
  /** Number of nodes in the graph. */
  totalPages: number;
  /** Number of edges in the graph. */
  totalEdges: number;
  /** URLs with no inbound links and depth > 0. */
  orphanPages: string[];
  /** URLs with exactly one inbound link and depth >= 3. */
  nearOrphans: string[];
  /** URLs at depth >= 4. */
  deepPages: string[];
  /** Up to ten pages with the highest authority value, best first. */
  topAuthorityPages: { url: string; authority: number }[];
  /** Edges divided by pages (0 for an empty graph). */
  averageOutDegree: number;
  /** Largest depth seen on any node. */
  maxDepthFound: number;
  /** `1 - deepPages / totalPages` (1 for an empty graph). */
  crawlEfficiencyScore: number;
  /** Mean node depth (0 for an empty graph). */
  averageDepth: number;
  /** Shannon entropy (base 2) of the out-degree distribution. */
  structuralEntropy: number;
  /** Up to ten pages with the highest raw PageRank, best first. */
  topPageRankPages: { url: string; score: number }[];
  /** Copied from the graph's `limitReached` flag. */
  limitReached: boolean;
  /** Copied from the graph's session counters, when available. */
  sessionStats?: {
    pagesFetched: number;
    pagesCached: number;
    pagesSkipped: number;
    totalFound: number;
  };
}
|
|
24
|
-
|
|
25
|
-
/**
 * Computes graph-wide metrics and, as a side effect, fills in `brokenLinks`
 * on every node that links to a broken page.
 *
 * A page counts as broken when its status is >= 400 or 0.
 *
 * @param graph     Graph to analyse; its nodes may be mutated (`brokenLinks`).
 * @param _maxDepth Unused; kept so existing callers keep compiling.
 * @returns The aggregate metrics for the graph.
 */
export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics {
  const nodes = graph.getNodes();
  const edges = graph.getEdges();

  const totalPages = nodes.length;
  const totalEdges = edges.length;

  // Identify broken nodes
  const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));

  // Group broken targets by source in a single pass over the edges instead of
  // re-scanning the whole edge list for every node. Edge order is preserved,
  // so each node's `brokenLinks` array has the same contents and order as a
  // per-node filter would produce.
  const brokenTargetsBySource = new Map<string, string[]>();
  for (const edge of edges) {
    if (!brokenNodes.has(edge.target)) continue;
    const targets = brokenTargetsBySource.get(edge.source);
    if (targets) {
      targets.push(edge.target);
    } else {
      brokenTargetsBySource.set(edge.source, [edge.target]);
    }
  }

  // Populate brokenLinks per node (only when there is at least one)
  for (const node of nodes) {
    const broken = brokenTargetsBySource.get(node.url);
    if (broken) {
      node.brokenLinks = broken;
    }
  }

  // Authority Score (per node): log-scaled inbound link count, normalised by
  // the most-linked page. Used only when a node has no `authorityScore`.
  const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
  const getAuthority = (node: GraphNode) => {
    if (maxInLinks === 0) return 0;
    return Math.log(1 + node.inLinks) / Math.log(1 + maxInLinks);
  };

  // orphanPages: inLinks === 0 && depth > 0
  const orphanPages = nodes
    .filter(n => n.inLinks === 0 && n.depth > 0)
    .map(n => n.url);

  // nearOrphans: inLinks === 1 && depth >= 3
  const nearOrphans = nodes
    .filter(n => n.inLinks === 1 && n.depth >= 3)
    .map(n => n.url);

  // deepPages: depth >= 4
  const deepPages = nodes
    .filter(n => n.depth >= 4)
    .map(n => n.url);

  // crawlEfficiencyScore: 1 - (deepPagesCount / totalPages)
  const crawlEfficiencyScore = totalPages > 0 ? 1 - (deepPages.length / totalPages) : 1;

  // averageDepth: sum(depth) / totalPages
  const sumDepth = nodes.reduce((acc, n) => acc + n.depth, 0);
  const averageDepth = totalPages > 0 ? sumDepth / totalPages : 0;

  // structuralEntropy: Shannon entropy over outDegree distribution
  const outDegreeCounts = new Map<number, number>();
  for (const n of nodes) {
    outDegreeCounts.set(n.outLinks, (outDegreeCounts.get(n.outLinks) || 0) + 1);
  }

  let structuralEntropy = 0;
  if (totalPages > 0) {
    for (const count of outDegreeCounts.values()) {
      const p = count / totalPages;
      if (p > 0) {
        structuralEntropy -= p * Math.log2(p);
      }
    }
  }

  // topAuthorityPages: Top 10 by authority
  const topAuthorityPages = nodes
    .map(n => ({ url: n.url, authority: n.authorityScore ?? getAuthority(n) }))
    .sort((a, b) => b.authority - a.authority)
    .slice(0, 10);

  // topPageRankPages: Top 10 by raw PageRank (nodes without a rank are skipped)
  const topPageRankPages = nodes
    .filter(n => n.pageRank !== undefined)
    .map(n => ({ url: n.url, score: n.pageRank! }))
    .sort((a, b) => b.score - a.score)
    .slice(0, 10);

  const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
  const maxDepthFound = nodes.reduce((max, n) => Math.max(max, n.depth), 0);

  return {
    totalPages,
    totalEdges,
    orphanPages,
    nearOrphans,
    deepPages,
    topAuthorityPages,
    averageOutDegree,
    maxDepthFound,
    crawlEfficiencyScore,
    averageDepth,
    structuralEntropy,
    topPageRankPages,
    limitReached: graph.limitReached,
    sessionStats: graph.sessionStats
  };
}
|
package/src/graph/pagerank.ts
DELETED
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
import { Graph, GraphNode } from './graph.js';
|
|
2
|
-
|
|
3
|
-
/** Tuning knobs for `computePageRank`; every field is optional. */
interface PageRankOptions {
  /** Damping factor `d`; `computePageRank` falls back to 0.85. */
  dampingFactor?: number;
  /** Upper bound on power-iteration rounds; falls back to 40. */
  maxIterations?: number;
  /** Iteration stops once the largest per-node change is below this; falls back to 1e-5. */
  convergenceThreshold?: number;
  /** Nodes whose `soft404Score` exceeds this are excluded; falls back to 0.8. */
  soft404WeightThreshold?: number;
}
|
|
9
|
-
|
|
10
|
-
/**
 * Weighted PageRank over the eligible part of the graph.
 *
 * Nodes are excluded when they are `noindex`, collapsed, above the soft-404
 * threshold, canonicalised to a different URL, have a status >= 400, or have
 * status 0. Only edges whose both ends are eligible take part. Rank held by
 * sink nodes (no eligible outgoing weight) is redistributed evenly.
 *
 * Results are written onto the eligible nodes: `pageRank` (raw value) and
 * `pageRankScore` (min-max rescaled to 0-100, or 100 for every node when all
 * ranks are equal). Ineligible nodes are left untouched. Nothing is returned.
 *
 * @param graph   Graph whose nodes receive the scores.
 * @param options Optional damping factor, iteration cap, convergence
 *                threshold and soft-404 cut-off.
 */
export function computePageRank(graph: Graph, options: PageRankOptions = {}) {
  const d = options.dampingFactor ?? 0.85;
  const maxIterations = options.maxIterations ?? 40;
  const epsilon = options.convergenceThreshold ?? 1e-5;
  const soft404Threshold = options.soft404WeightThreshold ?? 0.8;

  const allNodes = graph.getNodes();
  const allEdges = graph.getEdges();

  // 1. Filter Eligible Nodes
  const eligibleNodes = allNodes.filter(node => {
    if (node.noindex) return false;
    if (node.isCollapsed) return false;
    if (node.soft404Score && node.soft404Score > soft404Threshold) return false;
    if (node.canonical && node.canonical !== node.url) return false;
    if (node.status >= 400) return false; // Don't pass rank to broken pages
    if (node.status === 0) return false; // Don't pass rank to uncrawled/external pages
    return true;
  });

  const nodeCount = eligibleNodes.length;
  if (nodeCount === 0) return;

  const nodeUrls = eligibleNodes.map(n => n.url);
  const nodeMap = new Map<string, GraphNode>();
  eligibleNodes.forEach(n => nodeMap.set(n.url, n));

  // Initialize PageRank uniformly
  let pr = new Map<string, number>();
  nodeUrls.forEach(url => pr.set(url, 1 / nodeCount));

  // Pre-calculate weighted outbound sums and inverted adjacency
  const outWeights = new Map<string, number>();
  const incoming = new Map<string, { source: string; weight: number }[]>();
  const sinks: string[] = [];

  // Initialize outWeights for all eligible nodes
  nodeUrls.forEach(url => outWeights.set(url, 0));

  for (const edge of allEdges) {
    if (nodeMap.has(edge.source) && nodeMap.has(edge.target)) {
      // A missing or zero weight is treated as 1.0.
      const weight = edge.weight || 1.0;

      const sources = incoming.get(edge.target) ?? [];
      sources.push({ source: edge.source, weight });
      incoming.set(edge.target, sources);

      outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
    }
  }

  // Identify sinks (no eligible outgoing weight)
  nodeUrls.forEach(url => {
    if ((outWeights.get(url) || 0) === 0) {
      sinks.push(url);
    }
  });

  // Iterative Calculation
  for (let i = 0; i < maxIterations; i++) {
    const nextPr = new Map<string, number>();

    // Calculate total rank from sinks to redistribute
    let sinkRankTotal = 0;
    for (const url of sinks) {
      sinkRankTotal += pr.get(url) || 0;
    }

    const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);

    for (const url of nodeUrls) {
      let rankFromLinks = 0;
      const sources = incoming.get(url) || [];

      for (const edge of sources) {
        const sourceRank = pr.get(edge.source) || 0;
        const sourceOutWeight = outWeights.get(edge.source) || 1.0;
        rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
      }

      const newRank = baseRank + d * rankFromLinks;
      nextPr.set(url, newRank);
    }

    // Convergence check
    let maxDelta = 0;
    for (const url of nodeUrls) {
      const delta = Math.abs(nextPr.get(url)! - pr.get(url)!);
      if (delta > maxDelta) maxDelta = delta;
    }

    pr = nextPr;

    if (maxDelta < epsilon) break;
  }

  // 2. Normalization (0-100)
  // Single pass instead of Math.min(...ranks) / Math.max(...ranks): spreading
  // one argument per node can throw a RangeError on large graphs. Math.min /
  // Math.max per element keeps the same NaN-propagation behavior.
  let minPR = Infinity;
  let maxPR = -Infinity;
  for (const rank of pr.values()) {
    minPR = Math.min(minPR, rank);
    maxPR = Math.max(maxPR, rank);
  }
  const range = maxPR - minPR;

  for (const node of eligibleNodes) {
    const rawRank = pr.get(node.url)!;
    node.pageRank = rawRank;

    if (range > 1e-12) {
      node.pageRankScore = 100 * (rawRank - minPR) / range;
    } else {
      // If there's no range, all eligible pages are equally important.
      node.pageRankScore = 100;
    }
  }
}
|
package/src/graph/simhash.ts
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
/**
 * 64-bit SimHash helpers built on a 64-bit FNV-1a token hash, using `bigint`
 * for all 64-bit arithmetic.
 */
export class SimHash {
  private static FNV_PRIME = 1099511628211n;
  private static FNV_OFFSET_BASIS = 14695981039346656037n;
  private static MAX_UINT64 = 0xffffffffffffffffn;
  /** Number of bands a fingerprint is split into by {@link SimHash.getBands}. */
  public static readonly BANDS = 4;
  /** Width of each band in bits. */
  public static readonly BAND_WIDTH = 16;

  /**
   * 64-bit FNV-1a hash of `token`, fed one UTF-16 code unit at a time.
   *
   * @returns A value in the range [0, 2^64).
   */
  static fnv1a64(token: string): bigint {
    let acc = this.FNV_OFFSET_BASIS;
    for (let pos = 0; pos < token.length; pos += 1) {
      acc ^= BigInt(token.charCodeAt(pos));
      // Mask after multiplying to emulate 64-bit wrap-around.
      acc = (acc * this.FNV_PRIME) & this.MAX_UINT64;
    }
    return acc;
  }

  /**
   * Builds a 64-bit SimHash fingerprint from `tokens`.
   *
   * Each token votes +1 or -1 on every bit position according to its hash;
   * a fingerprint bit is set when the position's tally ends up positive.
   * An empty token list yields `0n`.
   */
  static generate(tokens: string[]): bigint {
    const tally = new Int32Array(64);

    for (const token of tokens) {
      let remaining = this.fnv1a64(token);
      for (let bit = 0; bit < 64; bit += 1) {
        tally[bit] += (remaining & 1n) === 1n ? 1 : -1;
        remaining >>= 1n;
      }
    }

    let fingerprint = 0n;
    tally.forEach((vote, bit) => {
      if (vote > 0) {
        fingerprint |= 1n << BigInt(bit);
      }
    });
    return fingerprint;
  }

  /**
   * Splits a fingerprint into `BANDS` chunks of `BAND_WIDTH` bits, lowest
   * bits first.
   */
  static getBands(simhash: bigint): number[] {
    return Array.from({ length: SimHash.BANDS }, (_unused, band) =>
      Number((simhash >> BigInt(band * SimHash.BAND_WIDTH)) & 0xFFFFn)
    );
  }

  /**
   * Number of differing bits between `a` and `b`.
   */
  static hammingDistance(a: bigint, b: bigint): number {
    let differing = 0;
    // Kernighan's trick: each step clears the lowest set bit.
    for (let diff = a ^ b; diff > 0n; diff &= diff - 1n) {
      differing += 1;
    }
    return differing;
  }
}
|
package/src/index.ts
DELETED
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
export * from './crawler/crawl.js';
|
|
2
|
-
export * from './crawler/normalize.js';
|
|
3
|
-
export * from './crawler/metricsRunner.js';
|
|
4
|
-
export * from './graph/metrics.js';
|
|
5
|
-
export * from './report/html.js';
|
|
6
|
-
export * from './report/crawl_template.js';
|
|
7
|
-
export * from './report/crawlExport.js';
|
|
8
|
-
export * from './graph/graph.js';
|
|
9
|
-
export * from './diff/compare.js';
|
|
10
|
-
export * from './scoring/orphanSeverity.js';
|
|
11
|
-
export * from './graph/pagerank.js';
|
|
12
|
-
export * from './graph/duplicate.js';
|
|
13
|
-
export * from './graph/cluster.js';
|
|
14
|
-
export * from './scoring/health.js';
|
|
15
|
-
export * from './scoring/hits.js';
|
|
16
|
-
export * from './analysis/analyze.js';
|
|
17
|
-
export * from './analysis/content.js';
|
|
18
|
-
export * from './analysis/seo.js';
|
|
19
|
-
export * from './analysis/images.js';
|
|
20
|
-
export * from './analysis/links.js';
|
|
21
|
-
export * from './audit/index.js';
|
|
22
|
-
export * from './audit/types.js';
|
|
23
|
-
export * from './db/index.js';
|
|
24
|
-
export * from './db/graphLoader.js';
|
|
25
|
-
export * from './db/repositories/SiteRepository.js';
|
|
26
|
-
export * from './db/repositories/SnapshotRepository.js';
|
|
27
|
-
export * from './db/repositories/PageRepository.js';
|
|
28
|
-
export * from './db/repositories/EdgeRepository.js';
|
|
29
|
-
export * from './db/repositories/MetricsRepository.js';
|
|
30
|
-
export * from './lock/lockManager.js';
|
|
31
|
-
export * from './lock/hashKey.js';
|
|
32
|
-
export * from './utils/version.js';
|
|
33
|
-
export * from './events.js';
|
package/src/lock/hashKey.ts
DELETED
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
import crypto from 'node:crypto';
|
|
2
|
-
import { normalizeUrl } from '../crawler/normalize.js';
|
|
3
|
-
|
|
4
|
-
// Option names that change the nature of a crawl and therefore take part in
// the lock key. The order is significant: generateLockKey copies options in
// this order, and that order determines the string that gets hashed.
const RELEVANT_FLAGS = [
  'limit', 'depth', 'output', 'sitemap',
  'incremental', 'detectSoft404', 'detectTraps', 'includeSubdomains',
  'allow', 'deny', 'proxy', 'ua',
  'maxRedirects', 'rate', 'maxBytes', 'concurrency'
];
|
|
23
|
-
|
|
24
|
-
/**
 * Derives a lock key for a command invocation.
 *
 * The key is the hex SHA-256 digest of a JSON document containing the command
 * name, the normalized target URL (the raw URL when normalization yields a
 * falsy result) and the subset of `options` listed in RELEVANT_FLAGS whose
 * value is not `undefined`.
 *
 * @param commandName Name of the command being run.
 * @param targetUrl   URL the command targets.
 * @param options     Command options; `options.query` controls whether the
 *                    query string is stripped during normalization.
 * @returns Hex-encoded SHA-256 digest.
 */
export function generateLockKey(commandName: string, targetUrl: string, options: any): string {
  // Query stripping mirrors the crawl logic: strip unless `query` is truthy.
  const normalizedTarget =
    normalizeUrl(targetUrl, '', { stripQuery: !options.query }) || targetUrl;

  // Copy the relevant options in RELEVANT_FLAGS order so the serialized form
  // does not depend on the caller's property order.
  const lockOptions: Record<string, any> = {};
  for (const flag of RELEVANT_FLAGS) {
    if (options[flag] === undefined) {
      continue;
    }
    lockOptions[flag] = options[flag];
  }

  // Keys were inserted in a fixed order, so JSON.stringify yields a stable
  // string in V8/Node.js.
  const stableString = JSON.stringify({
    command: commandName,
    target: normalizedTarget,
    options: lockOptions
  });

  const hasher = crypto.createHash('sha256');
  hasher.update(stableString);
  return hasher.digest('hex');
}
|