@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/** Iteration count used when the caller does not supply a usable one. */
const HITS_DEFAULT_ITERATIONS = 20;
/** Scores at or below this value are treated as zero when assigning a link role. */
const HITS_SCORE_FLOOR = 0.00001;
/**
 * Service to compute Hub and Authority scores using the HITS algorithm.
 * Operates purely on the internal link graph.
 */
export class HITSService {
    /**
     * Computes Hub and Authority scores using the HITS algorithm.
     *
     * Only nodes with status 200 or 0, no redirect chain and no `noindex` flag
     * take part; self-links and edges touching an excluded node are ignored.
     * Scores are L2-normalised after every half-step, so each score vector has
     * unit length (or is all zeros when there are no usable edges).
     *
     * @param {Graph} graph - The link graph to analyze.
     * @param {HITSOptions} options - Algorithm options (e.g. number of iterations).
     *   `null`/`undefined` and a missing, zero or non-finite `iterations` all
     *   fall back to the default of 20 iterations.
     * @returns {Map<string, HITSRow>} A map of page URLs to their HITS results.
     */
    evaluate(graph, options = {}) {
        // The default parameter only covers `undefined`; tolerate an explicit `null` too.
        const iterations = resolveHitsIterations((options ?? {}).iterations);
        const results = new Map();
        // 1. Filter eligible nodes
        const eligibleNodes = graph.getNodes().filter(isHitsEligible);
        const size = eligibleNodes.length;
        if (size === 0)
            return results;
        // Map URL to index so edges can be resolved in O(1)
        const urlToIndex = new Map();
        eligibleNodes.forEach((node, index) => urlToIndex.set(node.url, index));
        // Build adjacency lists; both directions store { index: <other end>, weight }
        const incoming = Array.from({ length: size }, () => []);
        const outgoing = Array.from({ length: size }, () => []);
        for (const edge of graph.getEdges()) {
            if (edge.source === edge.target)
                continue;
            const from = urlToIndex.get(edge.source);
            const to = urlToIndex.get(edge.target);
            if (from === undefined || to === undefined)
                continue;
            // A missing or zero weight counts as a plain link of weight 1
            const weight = edge.weight || 1.0;
            incoming[to].push({ index: from, weight });
            outgoing[from].push({ index: to, weight });
        }
        // 2. Iteration: authority from the hubs linking in, then hub from the
        // freshly normalised authorities linked to.
        const authScores = new Float64Array(size).fill(1.0);
        const hubScores = new Float64Array(size).fill(1.0);
        for (let iter = 0; iter < iterations; iter++) {
            propagateHitsScores(authScores, incoming, hubScores);
            propagateHitsScores(hubScores, outgoing, authScores);
        }
        // 3. Classification and result mapping
        const authStats = summarizeHitsScores(authScores);
        const hubStats = summarizeHitsScores(hubScores);
        for (let i = 0; i < size; i++) {
            const auth = authScores[i];
            const hub = hubScores[i];
            results.set(eligibleNodes[i].url, {
                authority_score: auth,
                hub_score: hub,
                link_role: classifyHitsRole(auth, hub, authStats, hubStats)
            });
        }
        return results;
    }
}
/** Returns the requested iteration count, or the default when it is falsy or non-finite (Infinity would never terminate). */
function resolveHitsIterations(requested) {
    const count = Number(requested);
    return Number.isFinite(count) && count !== 0 ? count : HITS_DEFAULT_ITERATIONS;
}
/** True for nodes that take part in the computation: status 200 or 0, not redirected, not noindex. */
function isHitsEligible(node) {
    const statusOk = node.status === 200 || node.status === 0;
    const redirected = Boolean(node.redirectChain) && node.redirectChain.length !== 0;
    return statusOk && !redirected && !node.noindex;
}
/** Overwrites `target[i]` with the weighted sum of `source` over node i's links, then L2-normalises `target` in place. */
function propagateHitsScores(target, linksByNode, source) {
    let sumOfSquares = 0;
    for (let i = 0; i < target.length; i++) {
        let total = 0;
        for (const link of linksByNode[i]) {
            total += source[link.index] * link.weight;
        }
        target[i] = total;
        sumOfSquares += total * total;
    }
    const norm = Math.sqrt(sumOfSquares);
    // A zero norm means no score flowed at all; leave the zeros untouched.
    if (norm > 0) {
        for (let i = 0; i < target.length; i++)
            target[i] /= norm;
    }
}
/** Returns the upper median and the maximum of a non-empty score vector. */
function summarizeHitsScores(scores) {
    // Typed-array sort is numeric ascending and works on a copy here.
    const sorted = scores.slice().sort();
    return {
        median: sorted[Math.floor(sorted.length / 2)],
        max: sorted[sorted.length - 1]
    };
}
/** Maps a node's two scores to 'power', 'authority', 'hub', 'balanced' or 'peripheral'. */
function classifyHitsRole(auth, hub, authStats, hubStats) {
    // "High" = above the median, or tied for a non-zero maximum (covers the
    // case where more than half of the nodes share the top score).
    const isHigh = (score, stats) => (score > stats.median || (score === stats.max && score > 0)) && score > HITS_SCORE_FLOOR;
    const highAuth = isHigh(auth, authStats);
    const highHub = isHigh(hub, hubStats);
    if (highAuth && highHub)
        return 'power';
    if (highAuth)
        return 'authority';
    if (highHub)
        return 'hub';
    if (auth > HITS_SCORE_FLOOR && hub > HITS_SCORE_FLOOR)
        return 'balanced';
    return 'peripheral';
}
|
package/dist/graph/metrics.d.ts
CHANGED
package/dist/graph/metrics.js
CHANGED
|
@@ -5,14 +5,24 @@ export function calculateMetrics(graph, _maxDepth) {
|
|
|
5
5
|
const totalEdges = edges.length;
|
|
6
6
|
// Identify broken nodes
|
|
7
7
|
const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));
|
|
8
|
+
// Pre-compute outgoing edges per node for faster lookup
|
|
9
|
+
const outgoingEdges = new Map();
|
|
10
|
+
for (const edge of edges) {
|
|
11
|
+
let targets = outgoingEdges.get(edge.source);
|
|
12
|
+
if (!targets) {
|
|
13
|
+
targets = [];
|
|
14
|
+
outgoingEdges.set(edge.source, targets);
|
|
15
|
+
}
|
|
16
|
+
targets.push(edge.target);
|
|
17
|
+
}
|
|
8
18
|
// Populate brokenLinks per node
|
|
9
19
|
for (const node of nodes) {
|
|
10
|
-
const
|
|
11
|
-
|
|
12
|
-
.
|
|
13
|
-
.
|
|
14
|
-
|
|
15
|
-
|
|
20
|
+
const targets = outgoingEdges.get(node.url);
|
|
21
|
+
if (targets) {
|
|
22
|
+
const broken = targets.filter(targetUrl => brokenNodes.has(targetUrl));
|
|
23
|
+
if (broken.length > 0) {
|
|
24
|
+
node.brokenLinks = broken;
|
|
25
|
+
}
|
|
16
26
|
}
|
|
17
27
|
}
|
|
18
28
|
// Authority Score (per node)
|
|
@@ -55,16 +65,11 @@ export function calculateMetrics(graph, _maxDepth) {
|
|
|
55
65
|
}
|
|
56
66
|
}
|
|
57
67
|
// topAuthorityPages: Top 10 by authority
|
|
58
|
-
const topAuthorityPages =
|
|
59
|
-
.
|
|
68
|
+
const topAuthorityPages = nodes
|
|
69
|
+
.filter(n => n.isInternal !== false && n.status > 0)
|
|
70
|
+
.map(n => ({ url: n.url, authority: getAuthority(n) }))
|
|
60
71
|
.sort((a, b) => b.authority - a.authority)
|
|
61
72
|
.slice(0, 10);
|
|
62
|
-
// topPageRankPages: Top 10 by raw PageRank
|
|
63
|
-
const topPageRankPages = [...nodes]
|
|
64
|
-
.filter(n => n.pageRank !== undefined)
|
|
65
|
-
.map(n => ({ url: n.url, score: n.pageRank }))
|
|
66
|
-
.sort((a, b) => b.score - a.score)
|
|
67
|
-
.slice(0, 10);
|
|
68
73
|
const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
|
|
69
74
|
const maxDepthFound = nodes.reduce((max, n) => Math.max(max, n.depth), 0);
|
|
70
75
|
return {
|
|
@@ -79,7 +84,6 @@ export function calculateMetrics(graph, _maxDepth) {
|
|
|
79
84
|
crawlEfficiencyScore,
|
|
80
85
|
averageDepth,
|
|
81
86
|
structuralEntropy,
|
|
82
|
-
topPageRankPages,
|
|
83
87
|
limitReached: graph.limitReached,
|
|
84
88
|
sessionStats: graph.sessionStats
|
|
85
89
|
};
|
package/dist/graph/pagerank.d.ts
CHANGED
|
@@ -1,12 +1,25 @@
|
|
|
1
1
|
import { Graph } from './graph.js';
|
|
2
|
-
interface
|
|
2
|
+
/** Per-URL result produced by `PageRankService.evaluate`. */
export interface PageRankRow {
    /** Rank value as left by the final iteration, before any 0-100 scaling. */
    raw_rank: number;
    /**
     * Min-max scaled rank in the range 0-100, rounded to 3 decimals.
     * When all ranks are effectively equal, `neutralScoreWhenFlat` is used instead.
     */
    score: number;
}
|
|
6
|
+
/** Optional tuning values for `PageRankService.evaluate`; each falls back to the default noted below when omitted. */
export interface PageRankOptions {
    /** Damping factor applied to rank received through links; defaults to 0.85. */
    dampingFactor?: number;
    /** Upper bound on the number of iterations; defaults to 40. */
    maxIterations?: number;
    /** Iteration stops once the largest per-node rank change falls below this value; defaults to 1e-5. */
    convergenceThreshold?: number;
    /** Nodes whose `soft404Score` exceeds this value are left out of the computation; defaults to 0.8. */
    soft404WeightThreshold?: number;
    /** Score assigned to every node when the rank range is too small to scale; defaults to 50. */
    neutralScoreWhenFlat?: number;
}
|
|
8
13
|
/**
|
|
9
|
-
*
|
|
14
|
+
* Service to analyze a site's link graph and compute PageRank metrics.
|
|
15
|
+
* Runs only on the full crawl graph.
|
|
10
16
|
*/
|
|
11
|
-
export declare
|
|
12
|
-
|
|
17
|
+
export declare class PageRankService {
|
|
18
|
+
/**
|
|
19
|
+
* Computes a Production-Grade Weighted PageRank over the given graph.
|
|
20
|
+
* @param {Graph} graph - The full site graph structure.
|
|
21
|
+
* @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
|
|
22
|
+
* @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
|
|
23
|
+
*/
|
|
24
|
+
evaluate(graph: Graph, options?: PageRankOptions): Map<string, PageRankRow>;
|
|
25
|
+
}
|
package/dist/graph/pagerank.js
CHANGED
|
@@ -1,104 +1,137 @@
|
|
|
1
|
+
import { DEFAULTS } from '../constants.js';
|
|
1
2
|
/**
|
|
2
|
-
*
|
|
3
|
+
* Service to analyze a site's link graph and compute PageRank metrics.
|
|
4
|
+
* Runs only on the full crawl graph.
|
|
3
5
|
*/
|
|
4
|
-
export
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
6
|
+
export class PageRankService {
|
|
7
|
+
/**
|
|
8
|
+
* Computes a Production-Grade Weighted PageRank over the given graph.
|
|
9
|
+
* @param {Graph} graph - The full site graph structure.
|
|
10
|
+
* @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
|
|
11
|
+
* @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
|
|
12
|
+
*/
|
|
13
|
+
evaluate(graph, options = {}) {
|
|
14
|
+
const d = options.dampingFactor ?? 0.85;
|
|
15
|
+
const maxIterations = options.maxIterations ?? 40;
|
|
16
|
+
const epsilon = options.convergenceThreshold ?? 1e-5;
|
|
17
|
+
const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
|
|
18
|
+
const neutralScoreWhenFlat = options.neutralScoreWhenFlat ?? 50;
|
|
19
|
+
const allNodes = graph.getNodes();
|
|
20
|
+
const allEdges = graph.getEdges();
|
|
21
|
+
// 1. Filter Eligible Nodes
|
|
22
|
+
const eligibleNodes = allNodes.filter(node => {
|
|
23
|
+
if (node.noindex)
|
|
24
|
+
return false;
|
|
25
|
+
if (node.isCollapsed)
|
|
26
|
+
return false;
|
|
27
|
+
// Keep compat with other plugins mutating soft404Score onto nodes
|
|
28
|
+
if (node.soft404Score && node.soft404Score > soft404Threshold)
|
|
29
|
+
return false;
|
|
30
|
+
// canonical is stored as absolute URL; extract pathname for path-based comparison
|
|
31
|
+
if (node.canonical) {
|
|
32
|
+
try {
|
|
33
|
+
const canonicalPath = new URL(node.canonical).pathname;
|
|
34
|
+
if (canonicalPath !== node.url)
|
|
35
|
+
return false;
|
|
36
|
+
}
|
|
37
|
+
catch {
|
|
38
|
+
// if canonical isn't a valid URL, compare as-is
|
|
39
|
+
if (node.canonical !== node.url)
|
|
40
|
+
return false;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
if (node.status >= 400)
|
|
44
|
+
return false; // Don't pass rank to broken pages
|
|
45
|
+
if (node.status === 0)
|
|
46
|
+
return false; // Don't pass rank to uncrawled/external pages
|
|
47
|
+
return true;
|
|
48
|
+
});
|
|
49
|
+
const nodeCount = eligibleNodes.length;
|
|
50
|
+
const results = new Map();
|
|
51
|
+
if (nodeCount === 0)
|
|
52
|
+
return results;
|
|
53
|
+
// Map URL to Index for O(1) access and TypedArray usage
|
|
54
|
+
const urlToIndex = new Map();
|
|
55
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
56
|
+
urlToIndex.set(eligibleNodes[i].url, i);
|
|
55
57
|
}
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
const
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
// Pre-calculate weighted outbound sums and inverted adjacency
|
|
59
|
+
const outWeights = new Float64Array(nodeCount);
|
|
60
|
+
const incoming = new Array(nodeCount).fill(null).map(() => []);
|
|
61
|
+
for (const edge of allEdges) {
|
|
62
|
+
const sourceIndex = urlToIndex.get(edge.source);
|
|
63
|
+
const targetIndex = urlToIndex.get(edge.target);
|
|
64
|
+
if (sourceIndex !== undefined && targetIndex !== undefined) {
|
|
65
|
+
const weight = edge.weight || 1.0;
|
|
66
|
+
incoming[targetIndex].push({ sourceIndex, weight });
|
|
67
|
+
outWeights[sourceIndex] += weight;
|
|
68
|
+
}
|
|
64
69
|
}
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
const sourceRank = pr.get(edge.source) || 0;
|
|
71
|
-
const sourceOutWeight = outWeights.get(edge.source) || 1.0;
|
|
72
|
-
rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
|
|
70
|
+
// Identify sinks
|
|
71
|
+
const sinks = [];
|
|
72
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
73
|
+
if (outWeights[i] === 0) {
|
|
74
|
+
sinks.push(i);
|
|
73
75
|
}
|
|
74
|
-
const newRank = baseRank + d * rankFromLinks;
|
|
75
|
-
nextPr.set(url, newRank);
|
|
76
76
|
}
|
|
77
|
-
//
|
|
78
|
-
let
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
77
|
+
// Initialize PageRank typed arrays
|
|
78
|
+
let pr = new Float64Array(nodeCount).fill(1 / nodeCount);
|
|
79
|
+
let nextPr = new Float64Array(nodeCount);
|
|
80
|
+
// Iterative Calculation
|
|
81
|
+
for (let iter = 0; iter < maxIterations; iter++) {
|
|
82
|
+
// Calculate total rank from sinks to redistribute
|
|
83
|
+
let sinkRankTotal = 0;
|
|
84
|
+
for (let i = 0; i < sinks.length; i++) {
|
|
85
|
+
sinkRankTotal += pr[sinks[i]];
|
|
86
|
+
}
|
|
87
|
+
const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
|
|
88
|
+
let maxDelta = 0;
|
|
89
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
90
|
+
let rankFromLinks = 0;
|
|
91
|
+
const sources = incoming[i];
|
|
92
|
+
for (let j = 0; j < sources.length; j++) {
|
|
93
|
+
const edge = sources[j];
|
|
94
|
+
const sourceRank = pr[edge.sourceIndex];
|
|
95
|
+
const sourceOutWeight = outWeights[edge.sourceIndex] || 1.0;
|
|
96
|
+
rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
|
|
97
|
+
}
|
|
98
|
+
const newRank = baseRank + d * rankFromLinks;
|
|
99
|
+
nextPr[i] = newRank;
|
|
100
|
+
const delta = Math.abs(newRank - pr[i]);
|
|
101
|
+
if (delta > maxDelta) {
|
|
102
|
+
maxDelta = delta;
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
// Swap arrays
|
|
106
|
+
const temp = pr;
|
|
107
|
+
pr = nextPr;
|
|
108
|
+
nextPr = temp;
|
|
109
|
+
if (maxDelta < epsilon)
|
|
110
|
+
break;
|
|
83
111
|
}
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
for (const node of eligibleNodes) {
|
|
94
|
-
const rawRank = pr.get(node.url);
|
|
95
|
-
node.pageRank = rawRank;
|
|
96
|
-
if (range > 1e-12) {
|
|
97
|
-
node.pageRankScore = 100 * (rawRank - minPR) / range;
|
|
112
|
+
// 2. Normalization (0-100)
|
|
113
|
+
let minPR = pr[0];
|
|
114
|
+
let maxPR = pr[0];
|
|
115
|
+
for (let i = 1; i < nodeCount; i++) {
|
|
116
|
+
const rank = pr[i];
|
|
117
|
+
if (rank < minPR)
|
|
118
|
+
minPR = rank;
|
|
119
|
+
if (rank > maxPR)
|
|
120
|
+
maxPR = rank;
|
|
98
121
|
}
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
122
|
+
const range = maxPR - minPR;
|
|
123
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
124
|
+
const rawRank = pr[i];
|
|
125
|
+
const url = eligibleNodes[i].url;
|
|
126
|
+
let score = neutralScoreWhenFlat;
|
|
127
|
+
if (range > DEFAULTS.GRAPH_PRECISION) {
|
|
128
|
+
score = 100 * (rawRank - minPR) / range;
|
|
129
|
+
}
|
|
130
|
+
results.set(url, {
|
|
131
|
+
raw_rank: rawRank,
|
|
132
|
+
score: Number(score.toFixed(3))
|
|
133
|
+
});
|
|
102
134
|
}
|
|
135
|
+
return results;
|
|
103
136
|
}
|
|
104
137
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,26 +1,35 @@
|
|
|
1
|
+
export * from './scoring/health.js';
|
|
1
2
|
export * from './crawler/crawl.js';
|
|
2
3
|
export * from './crawler/normalize.js';
|
|
3
4
|
export * from './crawler/metricsRunner.js';
|
|
5
|
+
export * from './crawler/trap.js';
|
|
4
6
|
export * from './graph/metrics.js';
|
|
5
|
-
export * from './report/html.js';
|
|
6
|
-
export * from './report/crawl_template.js';
|
|
7
|
-
export * from './report/crawlExport.js';
|
|
8
7
|
export * from './graph/graph.js';
|
|
9
|
-
export * from './
|
|
10
|
-
export * from './scoring/orphanSeverity.js';
|
|
8
|
+
export * from './graph/simhash.js';
|
|
11
9
|
export * from './graph/pagerank.js';
|
|
12
|
-
export * from './graph/
|
|
13
|
-
export * from './
|
|
14
|
-
export * from './
|
|
15
|
-
export * from './scoring/hits.js';
|
|
10
|
+
export * from './graph/hits.js';
|
|
11
|
+
export * from './diff/compare.js';
|
|
12
|
+
export * from './diff/service.js';
|
|
16
13
|
export * from './analysis/analyze.js';
|
|
17
14
|
export * from './analysis/content.js';
|
|
18
15
|
export * from './analysis/seo.js';
|
|
19
16
|
export * from './analysis/images.js';
|
|
20
17
|
export * from './analysis/links.js';
|
|
18
|
+
export * from './analysis/scoring.js';
|
|
19
|
+
export * from './analysis/clustering.js';
|
|
20
|
+
export * from './analysis/duplicate.js';
|
|
21
|
+
export * from './analysis/soft404.js';
|
|
22
|
+
export * from './analysis/heading.js';
|
|
23
|
+
export * from './analysis/orphan.js';
|
|
21
24
|
export * from './audit/index.js';
|
|
22
25
|
export * from './audit/types.js';
|
|
26
|
+
export * from './report/html.js';
|
|
27
|
+
export * from './report/crawl_template.js';
|
|
28
|
+
export * from './report/crawlExport.js';
|
|
29
|
+
export * from './report/export.js';
|
|
30
|
+
export * from './report/insight.js';
|
|
23
31
|
export * from './db/index.js';
|
|
32
|
+
export * from './db/reset.js';
|
|
24
33
|
export * from './db/graphLoader.js';
|
|
25
34
|
export * from './db/repositories/SiteRepository.js';
|
|
26
35
|
export * from './db/repositories/SnapshotRepository.js';
|
|
@@ -30,4 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
|
|
|
30
39
|
export * from './lock/lockManager.js';
|
|
31
40
|
export * from './lock/hashKey.js';
|
|
32
41
|
export * from './utils/version.js';
|
|
42
|
+
export * from './utils/secureConfig.js';
|
|
33
43
|
export * from './events.js';
|
|
44
|
+
export * from './plugin-system/plugin-types.js';
|
|
45
|
+
export * from './plugin-system/plugin-loader.js';
|
|
46
|
+
export * from './plugin-system/plugin-registry.js';
|
|
47
|
+
export * from './plugin-system/plugin-cli.js';
|
|
48
|
+
export * from './ports/index.js';
|
|
49
|
+
export * from './application/usecase.js';
|
|
50
|
+
export * from './application/usecases.js';
|
|
51
|
+
export { Command } from 'commander';
|
package/dist/index.js
CHANGED
|
@@ -1,26 +1,35 @@
|
|
|
1
|
+
export * from './scoring/health.js';
|
|
1
2
|
export * from './crawler/crawl.js';
|
|
2
3
|
export * from './crawler/normalize.js';
|
|
3
4
|
export * from './crawler/metricsRunner.js';
|
|
5
|
+
export * from './crawler/trap.js';
|
|
4
6
|
export * from './graph/metrics.js';
|
|
5
|
-
export * from './report/html.js';
|
|
6
|
-
export * from './report/crawl_template.js';
|
|
7
|
-
export * from './report/crawlExport.js';
|
|
8
7
|
export * from './graph/graph.js';
|
|
9
|
-
export * from './
|
|
10
|
-
export * from './scoring/orphanSeverity.js';
|
|
8
|
+
export * from './graph/simhash.js';
|
|
11
9
|
export * from './graph/pagerank.js';
|
|
12
|
-
export * from './graph/
|
|
13
|
-
export * from './
|
|
14
|
-
export * from './
|
|
15
|
-
export * from './scoring/hits.js';
|
|
10
|
+
export * from './graph/hits.js';
|
|
11
|
+
export * from './diff/compare.js';
|
|
12
|
+
export * from './diff/service.js';
|
|
16
13
|
export * from './analysis/analyze.js';
|
|
17
14
|
export * from './analysis/content.js';
|
|
18
15
|
export * from './analysis/seo.js';
|
|
19
16
|
export * from './analysis/images.js';
|
|
20
17
|
export * from './analysis/links.js';
|
|
18
|
+
export * from './analysis/scoring.js';
|
|
19
|
+
export * from './analysis/clustering.js';
|
|
20
|
+
export * from './analysis/duplicate.js';
|
|
21
|
+
export * from './analysis/soft404.js';
|
|
22
|
+
export * from './analysis/heading.js';
|
|
23
|
+
export * from './analysis/orphan.js';
|
|
21
24
|
export * from './audit/index.js';
|
|
22
25
|
export * from './audit/types.js';
|
|
26
|
+
export * from './report/html.js';
|
|
27
|
+
export * from './report/crawl_template.js';
|
|
28
|
+
export * from './report/crawlExport.js';
|
|
29
|
+
export * from './report/export.js';
|
|
30
|
+
export * from './report/insight.js';
|
|
23
31
|
export * from './db/index.js';
|
|
32
|
+
export * from './db/reset.js';
|
|
24
33
|
export * from './db/graphLoader.js';
|
|
25
34
|
export * from './db/repositories/SiteRepository.js';
|
|
26
35
|
export * from './db/repositories/SnapshotRepository.js';
|
|
@@ -30,4 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
|
|
|
30
39
|
export * from './lock/lockManager.js';
|
|
31
40
|
export * from './lock/hashKey.js';
|
|
32
41
|
export * from './utils/version.js';
|
|
42
|
+
export * from './utils/secureConfig.js';
|
|
33
43
|
export * from './events.js';
|
|
44
|
+
export * from './plugin-system/plugin-types.js';
|
|
45
|
+
export * from './plugin-system/plugin-loader.js';
|
|
46
|
+
export * from './plugin-system/plugin-registry.js';
|
|
47
|
+
export * from './plugin-system/plugin-cli.js';
|
|
48
|
+
export * from './ports/index.js';
|
|
49
|
+
export * from './application/usecase.js';
|
|
50
|
+
export * from './application/usecases.js';
|
|
51
|
+
export { Command } from 'commander';
|
|
@@ -5,6 +5,7 @@ export declare class LockManager {
|
|
|
5
5
|
private static get lockDir();
|
|
6
6
|
static acquireLock(commandName: string, targetUrl: string, options: any, context?: EngineContext, force?: boolean): Promise<void>;
|
|
7
7
|
static releaseLock(): void;
|
|
8
|
+
static clearAllLocks(): Promise<number>;
|
|
8
9
|
private static log;
|
|
9
10
|
private static registerHandlers;
|
|
10
11
|
}
|
package/dist/lock/lockManager.js
CHANGED
|
@@ -85,6 +85,21 @@ export class LockManager {
|
|
|
85
85
|
}
|
|
86
86
|
}
|
|
87
87
|
}
|
|
88
|
+
static async clearAllLocks() {
|
|
89
|
+
if (!existsSync(this.lockDir))
|
|
90
|
+
return 0;
|
|
91
|
+
const files = await fs.readdir(this.lockDir);
|
|
92
|
+
const lockFiles = files.filter(f => f.endsWith('.lock'));
|
|
93
|
+
let count = 0;
|
|
94
|
+
for (const file of lockFiles) {
|
|
95
|
+
try {
|
|
96
|
+
await fs.unlink(path.join(this.lockDir, file));
|
|
97
|
+
count++;
|
|
98
|
+
}
|
|
99
|
+
catch { /* ignore */ }
|
|
100
|
+
}
|
|
101
|
+
return count;
|
|
102
|
+
}
|
|
88
103
|
static log(type, message, error) {
|
|
89
104
|
if (this.context) {
|
|
90
105
|
this.context.emit({ type, message, error });
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { Command } from 'commander';
|
|
2
|
+
/**
 * Standard utility for plugins to register their configuration commands.
 * This ensures a consistent 'config <plugin> set' CLI pattern across the ecosystem.
 *
 * Does nothing unless `cli.name()` is 'crawlith'. Otherwise the top-level
 * `config` command is reused (or created when absent) and a `<pluginName>`
 * subcommand is attached to it; its `set <value>` action stores the value via
 * `setEncryptedConfigKey(pluginName, value)` and prints a confirmation line.
 *
 * @param cli - The main Commander instance (must have name 'crawlith').
 * @param pluginName - The unique name of the plugin (e.g., 'pagespeed'); also the key the value is stored under.
 * @param credentialLabel - Human-readable label for the credential (e.g., 'Google API Key').
 */
export declare function registerPluginConfigCommand(cli: Command, pluginName: string, credentialLabel: string): void;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { Command } from 'commander';
|
|
2
|
+
import { setEncryptedConfigKey } from '../utils/secureConfig.js';
|
|
3
|
+
/**
 * Standard utility for plugins to register their configuration commands.
 * This ensures a consistent 'config <plugin> set' CLI pattern across the ecosystem.
 *
 * The call is a no-op when `cli` is not the root 'crawlith' program, and also
 * when `config <pluginName>` has already been registered, so a plugin that is
 * loaded more than once does not add a duplicate subcommand.
 *
 * @param cli - The main Commander instance (must have name 'crawlith').
 * @param pluginName - The unique name of the plugin (e.g., 'pagespeed').
 * @param credentialLabel - Human-readable label for the credential (e.g., 'Google API Key').
 */
export function registerPluginConfigCommand(cli, pluginName, credentialLabel) {
    // Only register subcommands if we are in the root 'crawlith' CLI context
    if (cli.name() !== 'crawlith')
        return;
    // Find or create 'config' command
    let configCmd = cli.commands.find(c => c.name() === 'config');
    if (!configCmd) {
        configCmd = new Command('config').description('Manage Crawlith plugin configuration');
        cli.addCommand(configCmd);
    }
    // Registering the same plugin twice would add a second subcommand with an
    // identical name; keep the first registration and stop here.
    if (configCmd.commands.some(c => c.name() === pluginName))
        return;
    // Define plugin-specific subcommand
    const pluginConfigCmd = new Command(pluginName).description(`Manage ${pluginName} configuration`);
    pluginConfigCmd
        .command('set <value>')
        .description(`Set and encrypt ${credentialLabel}`)
        .action((value) => {
        // NOTE(review): the credential arrives as a plain CLI argument, so it may
        // end up in shell history — confirm this is acceptable for all plugins.
        setEncryptedConfigKey(pluginName, value);
        console.log(`✅ ${credentialLabel} for ${pluginName} saved and encrypted.`);
    });
    configCmd.addCommand(pluginConfigCmd);
}
|