@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/graph/pagerank.js
CHANGED
|
@@ -1,102 +1,137 @@
|
|
|
1
|
+
import { DEFAULTS } from '../constants.js';
|
|
1
2
|
/**
|
|
2
|
-
*
|
|
3
|
+
* Service to analyze a site's link graph and compute PageRank metrics.
|
|
4
|
+
* Runs only on the full crawl graph.
|
|
3
5
|
*/
|
|
4
|
-
export
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
6
|
+
export class PageRankService {
|
|
7
|
+
/**
|
|
8
|
+
* Computes a Production-Grade Weighted PageRank over the given graph.
|
|
9
|
+
* @param {Graph} graph - The full site graph structure.
|
|
10
|
+
* @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
|
|
11
|
+
* @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
|
|
12
|
+
*/
|
|
13
|
+
evaluate(graph, options = {}) {
|
|
14
|
+
const d = options.dampingFactor ?? 0.85;
|
|
15
|
+
const maxIterations = options.maxIterations ?? 40;
|
|
16
|
+
const epsilon = options.convergenceThreshold ?? 1e-5;
|
|
17
|
+
const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
|
|
18
|
+
const neutralScoreWhenFlat = options.neutralScoreWhenFlat ?? 50;
|
|
19
|
+
const allNodes = graph.getNodes();
|
|
20
|
+
const allEdges = graph.getEdges();
|
|
21
|
+
// 1. Filter Eligible Nodes
|
|
22
|
+
const eligibleNodes = allNodes.filter(node => {
|
|
23
|
+
if (node.noindex)
|
|
24
|
+
return false;
|
|
25
|
+
if (node.isCollapsed)
|
|
26
|
+
return false;
|
|
27
|
+
// Keep compat with other plugins mutating soft404Score onto nodes
|
|
28
|
+
if (node.soft404Score && node.soft404Score > soft404Threshold)
|
|
29
|
+
return false;
|
|
30
|
+
// canonical is stored as absolute URL; extract pathname for path-based comparison
|
|
31
|
+
if (node.canonical) {
|
|
32
|
+
try {
|
|
33
|
+
const canonicalPath = new URL(node.canonical).pathname;
|
|
34
|
+
if (canonicalPath !== node.url)
|
|
35
|
+
return false;
|
|
36
|
+
}
|
|
37
|
+
catch {
|
|
38
|
+
// if canonical isn't a valid URL, compare as-is
|
|
39
|
+
if (node.canonical !== node.url)
|
|
40
|
+
return false;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
if (node.status >= 400)
|
|
44
|
+
return false; // Don't pass rank to broken pages
|
|
45
|
+
if (node.status === 0)
|
|
46
|
+
return false; // Don't pass rank to uncrawled/external pages
|
|
47
|
+
return true;
|
|
48
|
+
});
|
|
49
|
+
const nodeCount = eligibleNodes.length;
|
|
50
|
+
const results = new Map();
|
|
51
|
+
if (nodeCount === 0)
|
|
52
|
+
return results;
|
|
53
|
+
// Map URL to Index for O(1) access and TypedArray usage
|
|
54
|
+
const urlToIndex = new Map();
|
|
55
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
56
|
+
urlToIndex.set(eligibleNodes[i].url, i);
|
|
53
57
|
}
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
// Pre-calculate weighted outbound sums and inverted adjacency
|
|
59
|
+
const outWeights = new Float64Array(nodeCount);
|
|
60
|
+
const incoming = new Array(nodeCount).fill(null).map(() => []);
|
|
61
|
+
for (const edge of allEdges) {
|
|
62
|
+
const sourceIndex = urlToIndex.get(edge.source);
|
|
63
|
+
const targetIndex = urlToIndex.get(edge.target);
|
|
64
|
+
if (sourceIndex !== undefined && targetIndex !== undefined) {
|
|
65
|
+
const weight = edge.weight || 1.0;
|
|
66
|
+
incoming[targetIndex].push({ sourceIndex, weight });
|
|
67
|
+
outWeights[sourceIndex] += weight;
|
|
68
|
+
}
|
|
62
69
|
}
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
const sourceRank = pr.get(edge.source) || 0;
|
|
69
|
-
const sourceOutWeight = outWeights.get(edge.source) || 1.0;
|
|
70
|
-
rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
|
|
70
|
+
// Identify sinks
|
|
71
|
+
const sinks = [];
|
|
72
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
73
|
+
if (outWeights[i] === 0) {
|
|
74
|
+
sinks.push(i);
|
|
71
75
|
}
|
|
72
|
-
const newRank = baseRank + d * rankFromLinks;
|
|
73
|
-
nextPr.set(url, newRank);
|
|
74
76
|
}
|
|
75
|
-
//
|
|
76
|
-
let
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
77
|
+
// Initialize PageRank typed arrays
|
|
78
|
+
let pr = new Float64Array(nodeCount).fill(1 / nodeCount);
|
|
79
|
+
let nextPr = new Float64Array(nodeCount);
|
|
80
|
+
// Iterative Calculation
|
|
81
|
+
for (let iter = 0; iter < maxIterations; iter++) {
|
|
82
|
+
// Calculate total rank from sinks to redistribute
|
|
83
|
+
let sinkRankTotal = 0;
|
|
84
|
+
for (let i = 0; i < sinks.length; i++) {
|
|
85
|
+
sinkRankTotal += pr[sinks[i]];
|
|
86
|
+
}
|
|
87
|
+
const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
|
|
88
|
+
let maxDelta = 0;
|
|
89
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
90
|
+
let rankFromLinks = 0;
|
|
91
|
+
const sources = incoming[i];
|
|
92
|
+
for (let j = 0; j < sources.length; j++) {
|
|
93
|
+
const edge = sources[j];
|
|
94
|
+
const sourceRank = pr[edge.sourceIndex];
|
|
95
|
+
const sourceOutWeight = outWeights[edge.sourceIndex] || 1.0;
|
|
96
|
+
rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
|
|
97
|
+
}
|
|
98
|
+
const newRank = baseRank + d * rankFromLinks;
|
|
99
|
+
nextPr[i] = newRank;
|
|
100
|
+
const delta = Math.abs(newRank - pr[i]);
|
|
101
|
+
if (delta > maxDelta) {
|
|
102
|
+
maxDelta = delta;
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
// Swap arrays
|
|
106
|
+
const temp = pr;
|
|
107
|
+
pr = nextPr;
|
|
108
|
+
nextPr = temp;
|
|
109
|
+
if (maxDelta < epsilon)
|
|
110
|
+
break;
|
|
81
111
|
}
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
for (const node of eligibleNodes) {
|
|
92
|
-
const rawRank = pr.get(node.url);
|
|
93
|
-
node.pageRank = rawRank;
|
|
94
|
-
if (range > 1e-12) {
|
|
95
|
-
node.pageRankScore = 100 * (rawRank - minPR) / range;
|
|
112
|
+
// 2. Normalization (0-100)
|
|
113
|
+
let minPR = pr[0];
|
|
114
|
+
let maxPR = pr[0];
|
|
115
|
+
for (let i = 1; i < nodeCount; i++) {
|
|
116
|
+
const rank = pr[i];
|
|
117
|
+
if (rank < minPR)
|
|
118
|
+
minPR = rank;
|
|
119
|
+
if (rank > maxPR)
|
|
120
|
+
maxPR = rank;
|
|
96
121
|
}
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
122
|
+
const range = maxPR - minPR;
|
|
123
|
+
for (let i = 0; i < nodeCount; i++) {
|
|
124
|
+
const rawRank = pr[i];
|
|
125
|
+
const url = eligibleNodes[i].url;
|
|
126
|
+
let score = neutralScoreWhenFlat;
|
|
127
|
+
if (range > DEFAULTS.GRAPH_PRECISION) {
|
|
128
|
+
score = 100 * (rawRank - minPR) / range;
|
|
129
|
+
}
|
|
130
|
+
results.set(url, {
|
|
131
|
+
raw_rank: rawRank,
|
|
132
|
+
score: Number(score.toFixed(3))
|
|
133
|
+
});
|
|
100
134
|
}
|
|
135
|
+
return results;
|
|
101
136
|
}
|
|
102
137
|
}
|
package/dist/graph/simhash.d.ts
CHANGED
|
@@ -2,6 +2,8 @@ export declare class SimHash {
|
|
|
2
2
|
private static FNV_PRIME;
|
|
3
3
|
private static FNV_OFFSET_BASIS;
|
|
4
4
|
private static MAX_UINT64;
|
|
5
|
+
static readonly BANDS = 4;
|
|
6
|
+
static readonly BAND_WIDTH = 16;
|
|
5
7
|
/**
|
|
6
8
|
* Generates a 64-bit FNV-1a hash for a given string token.
|
|
7
9
|
*/
|
|
@@ -10,6 +12,10 @@ export declare class SimHash {
|
|
|
10
12
|
* Generates a 64-bit SimHash from an array of tokens.
|
|
11
13
|
*/
|
|
12
14
|
static generate(tokens: string[]): bigint;
|
|
15
|
+
/**
|
|
16
|
+
* Splits a 64-bit SimHash into 4 bands of 16 bits.
|
|
17
|
+
*/
|
|
18
|
+
static getBands(simhash: bigint): number[];
|
|
13
19
|
/**
|
|
14
20
|
* Computes the Hamming distance between two 64-bit hashes.
|
|
15
21
|
*/
|
package/dist/graph/simhash.js
CHANGED
|
@@ -2,6 +2,8 @@ export class SimHash {
|
|
|
2
2
|
static FNV_PRIME = 1099511628211n;
|
|
3
3
|
static FNV_OFFSET_BASIS = 14695981039346656037n;
|
|
4
4
|
static MAX_UINT64 = 0xffffffffffffffffn;
|
|
5
|
+
static BANDS = 4;
|
|
6
|
+
static BAND_WIDTH = 16;
|
|
5
7
|
/**
|
|
6
8
|
* Generates a 64-bit FNV-1a hash for a given string token.
|
|
7
9
|
*/
|
|
@@ -40,6 +42,18 @@ export class SimHash {
|
|
|
40
42
|
}
|
|
41
43
|
return simhash;
|
|
42
44
|
}
|
|
45
|
+
/**
|
|
46
|
+
* Splits a 64-bit SimHash into 4 bands of 16 bits.
|
|
47
|
+
*/
|
|
48
|
+
static getBands(simhash) {
|
|
49
|
+
const bands = [];
|
|
50
|
+
for (let i = 0; i < SimHash.BANDS; i++) {
|
|
51
|
+
// Extract 16-bit chunks
|
|
52
|
+
const chunk = Number((simhash >> BigInt(i * SimHash.BAND_WIDTH)) & 0xffffn);
|
|
53
|
+
bands.push(chunk);
|
|
54
|
+
}
|
|
55
|
+
return bands;
|
|
56
|
+
}
|
|
43
57
|
/**
|
|
44
58
|
* Computes the Hamming distance between two 64-bit hashes.
|
|
45
59
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -1,24 +1,35 @@
|
|
|
1
|
+
export * from './scoring/health.js';
|
|
1
2
|
export * from './crawler/crawl.js';
|
|
3
|
+
export * from './crawler/normalize.js';
|
|
2
4
|
export * from './crawler/metricsRunner.js';
|
|
5
|
+
export * from './crawler/trap.js';
|
|
3
6
|
export * from './graph/metrics.js';
|
|
4
|
-
export * from './report/html.js';
|
|
5
|
-
export * from './report/sitegraph_template.js';
|
|
6
|
-
export * from './report/sitegraphExport.js';
|
|
7
7
|
export * from './graph/graph.js';
|
|
8
|
-
export * from './
|
|
9
|
-
export * from './scoring/orphanSeverity.js';
|
|
8
|
+
export * from './graph/simhash.js';
|
|
10
9
|
export * from './graph/pagerank.js';
|
|
11
|
-
export * from './graph/
|
|
12
|
-
export * from './
|
|
13
|
-
export * from './
|
|
10
|
+
export * from './graph/hits.js';
|
|
11
|
+
export * from './diff/compare.js';
|
|
12
|
+
export * from './diff/service.js';
|
|
14
13
|
export * from './analysis/analyze.js';
|
|
15
14
|
export * from './analysis/content.js';
|
|
16
15
|
export * from './analysis/seo.js';
|
|
17
16
|
export * from './analysis/images.js';
|
|
18
17
|
export * from './analysis/links.js';
|
|
18
|
+
export * from './analysis/scoring.js';
|
|
19
|
+
export * from './analysis/clustering.js';
|
|
20
|
+
export * from './analysis/duplicate.js';
|
|
21
|
+
export * from './analysis/soft404.js';
|
|
22
|
+
export * from './analysis/heading.js';
|
|
23
|
+
export * from './analysis/orphan.js';
|
|
19
24
|
export * from './audit/index.js';
|
|
20
25
|
export * from './audit/types.js';
|
|
26
|
+
export * from './report/html.js';
|
|
27
|
+
export * from './report/crawl_template.js';
|
|
28
|
+
export * from './report/crawlExport.js';
|
|
29
|
+
export * from './report/export.js';
|
|
30
|
+
export * from './report/insight.js';
|
|
21
31
|
export * from './db/index.js';
|
|
32
|
+
export * from './db/reset.js';
|
|
22
33
|
export * from './db/graphLoader.js';
|
|
23
34
|
export * from './db/repositories/SiteRepository.js';
|
|
24
35
|
export * from './db/repositories/SnapshotRepository.js';
|
|
@@ -28,3 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
|
|
|
28
39
|
export * from './lock/lockManager.js';
|
|
29
40
|
export * from './lock/hashKey.js';
|
|
30
41
|
export * from './utils/version.js';
|
|
42
|
+
export * from './utils/secureConfig.js';
|
|
43
|
+
export * from './events.js';
|
|
44
|
+
export * from './plugin-system/plugin-types.js';
|
|
45
|
+
export * from './plugin-system/plugin-loader.js';
|
|
46
|
+
export * from './plugin-system/plugin-registry.js';
|
|
47
|
+
export * from './plugin-system/plugin-cli.js';
|
|
48
|
+
export * from './ports/index.js';
|
|
49
|
+
export * from './application/usecase.js';
|
|
50
|
+
export * from './application/usecases.js';
|
|
51
|
+
export { Command } from 'commander';
|
package/dist/index.js
CHANGED
|
@@ -1,24 +1,35 @@
|
|
|
1
|
+
export * from './scoring/health.js';
|
|
1
2
|
export * from './crawler/crawl.js';
|
|
3
|
+
export * from './crawler/normalize.js';
|
|
2
4
|
export * from './crawler/metricsRunner.js';
|
|
5
|
+
export * from './crawler/trap.js';
|
|
3
6
|
export * from './graph/metrics.js';
|
|
4
|
-
export * from './report/html.js';
|
|
5
|
-
export * from './report/sitegraph_template.js';
|
|
6
|
-
export * from './report/sitegraphExport.js';
|
|
7
7
|
export * from './graph/graph.js';
|
|
8
|
-
export * from './
|
|
9
|
-
export * from './scoring/orphanSeverity.js';
|
|
8
|
+
export * from './graph/simhash.js';
|
|
10
9
|
export * from './graph/pagerank.js';
|
|
11
|
-
export * from './graph/
|
|
12
|
-
export * from './
|
|
13
|
-
export * from './
|
|
10
|
+
export * from './graph/hits.js';
|
|
11
|
+
export * from './diff/compare.js';
|
|
12
|
+
export * from './diff/service.js';
|
|
14
13
|
export * from './analysis/analyze.js';
|
|
15
14
|
export * from './analysis/content.js';
|
|
16
15
|
export * from './analysis/seo.js';
|
|
17
16
|
export * from './analysis/images.js';
|
|
18
17
|
export * from './analysis/links.js';
|
|
18
|
+
export * from './analysis/scoring.js';
|
|
19
|
+
export * from './analysis/clustering.js';
|
|
20
|
+
export * from './analysis/duplicate.js';
|
|
21
|
+
export * from './analysis/soft404.js';
|
|
22
|
+
export * from './analysis/heading.js';
|
|
23
|
+
export * from './analysis/orphan.js';
|
|
19
24
|
export * from './audit/index.js';
|
|
20
25
|
export * from './audit/types.js';
|
|
26
|
+
export * from './report/html.js';
|
|
27
|
+
export * from './report/crawl_template.js';
|
|
28
|
+
export * from './report/crawlExport.js';
|
|
29
|
+
export * from './report/export.js';
|
|
30
|
+
export * from './report/insight.js';
|
|
21
31
|
export * from './db/index.js';
|
|
32
|
+
export * from './db/reset.js';
|
|
22
33
|
export * from './db/graphLoader.js';
|
|
23
34
|
export * from './db/repositories/SiteRepository.js';
|
|
24
35
|
export * from './db/repositories/SnapshotRepository.js';
|
|
@@ -28,3 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
|
|
|
28
39
|
export * from './lock/lockManager.js';
|
|
29
40
|
export * from './lock/hashKey.js';
|
|
30
41
|
export * from './utils/version.js';
|
|
42
|
+
export * from './utils/secureConfig.js';
|
|
43
|
+
export * from './events.js';
|
|
44
|
+
export * from './plugin-system/plugin-types.js';
|
|
45
|
+
export * from './plugin-system/plugin-loader.js';
|
|
46
|
+
export * from './plugin-system/plugin-registry.js';
|
|
47
|
+
export * from './plugin-system/plugin-cli.js';
|
|
48
|
+
export * from './ports/index.js';
|
|
49
|
+
export * from './application/usecase.js';
|
|
50
|
+
export * from './application/usecases.js';
|
|
51
|
+
export { Command } from 'commander';
|
package/dist/lock/hashKey.js
CHANGED
|
@@ -20,7 +20,7 @@ const RELEVANT_FLAGS = [
|
|
|
20
20
|
'concurrency'
|
|
21
21
|
];
|
|
22
22
|
export function generateLockKey(commandName, targetUrl, options) {
|
|
23
|
-
// Respect the query stripping option consistent with
|
|
23
|
+
// Respect the query stripping option consistent with crawl logic
|
|
24
24
|
const stripQuery = !options.query;
|
|
25
25
|
const normalizedTarget = normalizeUrl(targetUrl, '', { stripQuery }) || targetUrl;
|
|
26
26
|
// Extract relevant options in a deterministic order
|
package/dist/lock/lockManager.d.ts
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import { EngineContext } from '../events.js';
|
|
1
2
|
export declare class LockManager {
|
|
2
3
|
private static lockFilePath;
|
|
4
|
+
private static context;
|
|
3
5
|
private static get lockDir();
|
|
4
|
-
static acquireLock(commandName: string, targetUrl: string, options: any, force?: boolean): Promise<void>;
|
|
6
|
+
static acquireLock(commandName: string, targetUrl: string, options: any, context?: EngineContext, force?: boolean): Promise<void>;
|
|
5
7
|
static releaseLock(): void;
|
|
8
|
+
static clearAllLocks(): Promise<number>;
|
|
9
|
+
private static log;
|
|
6
10
|
private static registerHandlers;
|
|
7
11
|
}
|
package/dist/lock/lockManager.js
CHANGED
|
@@ -2,18 +2,18 @@ import fs from 'node:fs/promises';
|
|
|
2
2
|
import { existsSync, unlinkSync, readFileSync } from 'node:fs';
|
|
3
3
|
import path from 'node:path';
|
|
4
4
|
import os from 'node:os';
|
|
5
|
-
import chalk from 'chalk';
|
|
6
5
|
import { generateLockKey } from './hashKey.js';
|
|
7
6
|
import { isPidAlive } from './pidCheck.js';
|
|
8
7
|
export class LockManager {
|
|
9
8
|
static lockFilePath = null;
|
|
9
|
+
static context = null;
|
|
10
10
|
static get lockDir() {
|
|
11
11
|
return path.join(os.homedir(), '.crawlith', 'locks');
|
|
12
12
|
}
|
|
13
|
-
static async acquireLock(commandName, targetUrl, options, force = false) {
|
|
13
|
+
static async acquireLock(commandName, targetUrl, options, context, force = false) {
|
|
14
|
+
this.context = context || null;
|
|
14
15
|
const lockHash = generateLockKey(commandName, targetUrl, options);
|
|
15
16
|
// Ensure lock directory exists
|
|
16
|
-
// We can use sync or async here. Since this is one-time setup, async is fine.
|
|
17
17
|
await fs.mkdir(this.lockDir, { recursive: true });
|
|
18
18
|
const lockPath = path.join(this.lockDir, `${lockHash}.lock`);
|
|
19
19
|
// Check existing lock
|
|
@@ -29,10 +29,10 @@ export class LockManager {
|
|
|
29
29
|
catch (_e) {
|
|
30
30
|
// Corrupted -> Treat as stale
|
|
31
31
|
isStale = true;
|
|
32
|
-
pid = 0;
|
|
32
|
+
pid = 0;
|
|
33
33
|
}
|
|
34
34
|
if (force) {
|
|
35
|
-
|
|
35
|
+
this.log('warn', 'Force mode enabled. Overriding existing lock.');
|
|
36
36
|
try {
|
|
37
37
|
unlinkSync(lockPath);
|
|
38
38
|
}
|
|
@@ -40,11 +40,11 @@ export class LockManager {
|
|
|
40
40
|
}
|
|
41
41
|
else {
|
|
42
42
|
if (!isStale) {
|
|
43
|
-
|
|
43
|
+
this.log('error', `Crawlith: command already running for ${targetUrl} (PID ${pid})`);
|
|
44
44
|
process.exit(1);
|
|
45
45
|
}
|
|
46
46
|
else {
|
|
47
|
-
|
|
47
|
+
this.log('info', 'Detected stale lock. Continuing execution.');
|
|
48
48
|
try {
|
|
49
49
|
unlinkSync(lockPath);
|
|
50
50
|
}
|
|
@@ -68,8 +68,7 @@ export class LockManager {
|
|
|
68
68
|
}
|
|
69
69
|
catch (error) {
|
|
70
70
|
if (error.code === 'EEXIST') {
|
|
71
|
-
|
|
72
|
-
console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
|
|
71
|
+
this.log('error', `Crawlith: command already running for ${targetUrl} (Race condition)`);
|
|
73
72
|
process.exit(1);
|
|
74
73
|
}
|
|
75
74
|
throw error;
|
|
@@ -86,15 +85,41 @@ export class LockManager {
|
|
|
86
85
|
}
|
|
87
86
|
}
|
|
88
87
|
}
|
|
88
|
+
static async clearAllLocks() {
|
|
89
|
+
if (!existsSync(this.lockDir))
|
|
90
|
+
return 0;
|
|
91
|
+
const files = await fs.readdir(this.lockDir);
|
|
92
|
+
const lockFiles = files.filter(f => f.endsWith('.lock'));
|
|
93
|
+
let count = 0;
|
|
94
|
+
for (const file of lockFiles) {
|
|
95
|
+
try {
|
|
96
|
+
await fs.unlink(path.join(this.lockDir, file));
|
|
97
|
+
count++;
|
|
98
|
+
}
|
|
99
|
+
catch { /* ignore */ }
|
|
100
|
+
}
|
|
101
|
+
return count;
|
|
102
|
+
}
|
|
103
|
+
static log(type, message, error) {
|
|
104
|
+
if (this.context) {
|
|
105
|
+
this.context.emit({ type, message, error });
|
|
106
|
+
}
|
|
107
|
+
else {
|
|
108
|
+
// Fallback for legacy usage or when no context provided
|
|
109
|
+
if (type === 'error')
|
|
110
|
+
console.error(message, error || '');
|
|
111
|
+
else if (type === 'warn')
|
|
112
|
+
console.warn(message);
|
|
113
|
+
else
|
|
114
|
+
console.log(message);
|
|
115
|
+
}
|
|
116
|
+
}
|
|
89
117
|
static registerHandlers() {
|
|
90
118
|
// Ensure cleanup only happens once
|
|
91
119
|
const cleanup = () => {
|
|
92
120
|
this.releaseLock();
|
|
93
121
|
};
|
|
94
|
-
// process.on('exit') is only called when process.exit() is called or event loop empties.
|
|
95
|
-
// It requires synchronous cleanup.
|
|
96
122
|
process.on('exit', cleanup);
|
|
97
|
-
// Signals
|
|
98
123
|
process.on('SIGINT', () => {
|
|
99
124
|
cleanup();
|
|
100
125
|
process.exit(130);
|
|
@@ -104,7 +129,7 @@ export class LockManager {
|
|
|
104
129
|
process.exit(143);
|
|
105
130
|
});
|
|
106
131
|
process.on('uncaughtException', (err) => {
|
|
107
|
-
|
|
132
|
+
this.log('error', 'Uncaught Exception', err);
|
|
108
133
|
cleanup();
|
|
109
134
|
process.exit(1);
|
|
110
135
|
});
|
package/dist/plugin-system/plugin-cli.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { Command } from 'commander';
|
|
2
|
+
/**
|
|
3
|
+
* Standard utility for plugins to register their configuration commands.
|
|
4
|
+
* This ensures a consistent 'config <plugin> set' CLI pattern across the ecosystem.
|
|
5
|
+
*
|
|
6
|
+
* @param cli - The main Commander instance (must have name 'crawlith').
|
|
7
|
+
* @param pluginName - The unique name of the plugin (e.g., 'pagespeed').
|
|
8
|
+
* @param credentialLabel - Human-readable label for the credential (e.g., 'Google API Key').
|
|
9
|
+
*/
|
|
10
|
+
export declare function registerPluginConfigCommand(cli: Command, pluginName: string, credentialLabel: string): void;
|
package/dist/plugin-system/plugin-cli.js
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { Command } from 'commander';
|
|
2
|
+
import { setEncryptedConfigKey } from '../utils/secureConfig.js';
|
|
3
|
+
/**
 * Shared helper that lets a plugin expose its credential through the CLI,
 * keeping the `config <plugin> set` command shape uniform across plugins.
 *
 * Does nothing unless `cli.name()` is exactly 'crawlith'. An existing
 * top-level `config` command is reused; one is created and attached to
 * `cli` only when none is found.
 *
 * @param cli - Root Commander instance; must be named 'crawlith' for anything to be registered.
 * @param pluginName - Unique plugin identifier, used as the subcommand name (e.g., 'pagespeed').
 * @param credentialLabel - Human-readable name of the credential being stored (e.g., 'Google API Key').
 */
export function registerPluginConfigCommand(cli, pluginName, credentialLabel) {
    // Plugins may be handed other Commander programs; only the root CLI gets subcommands.
    const isRootCli = cli.name() === 'crawlith';
    if (!isRootCli) {
        return;
    }
    // Reuse the shared 'config' parent if some other plugin already created it.
    let configRoot = cli.commands.find((candidate) => candidate.name() === 'config');
    if (!configRoot) {
        configRoot = new Command('config').description('Manage Crawlith plugin configuration');
        cli.addCommand(configRoot);
    }
    // Action for `config <plugin> set <value>`: store the value, then confirm to the user.
    const storeCredential = (value) => {
        setEncryptedConfigKey(pluginName, value);
        console.log(`✅ ${credentialLabel} for ${pluginName} saved and encrypted.`);
    };
    const pluginCmd = new Command(pluginName).description(`Manage ${pluginName} configuration`);
    const setCmd = pluginCmd.command('set <value>');
    setCmd.description(`Set and encrypt ${credentialLabel}`).action(storeCredential);
    configRoot.addCommand(pluginCmd);
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * Accessor for the encrypted configuration value belonging to one plugin.
 */
export declare class PluginConfig {
    private pluginName;
    /**
     * @param pluginName - Name of the plugin whose configuration this instance reads and writes.
     */
    constructor(pluginName: string);
    /**
     * Read the decrypted configuration value for this plugin.
     *
     * @param keyName - Optional section name; when omitted the plugin's own name is used.
     * @returns The decrypted value.
     */
    get(keyName?: string): string;
    /**
     * Same as `get`, but failures are reported with a user-facing error
     * that names the `crawlith config … set` command to run.
     *
     * @param keyName - Optional section name; when omitted the plugin's own name is used.
     * @returns The decrypted value.
     */
    require(keyName?: string): string;
    /**
     * Encrypt and store a configuration value for this plugin.
     *
     * @param value - The plain-text value to store.
     */
    set(value: string): void;
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { getDecryptedConfigKey, setEncryptedConfigKey } from '../utils/secureConfig.js';
|
|
2
|
+
/**
 * Accessor for the encrypted configuration value belonging to one plugin.
 * Every read and write is scoped to the plugin name given at construction.
 */
export class PluginConfig {
    pluginName;
    /**
     * @param pluginName - Name of the plugin whose configuration section this instance manages.
     */
    constructor(pluginName) {
        this.pluginName = pluginName;
    }
    /**
     * Get a decrypted config key for the current plugin.
     *
     * @param keyName - Optional section name; a falsy value means "this plugin's own section".
     * @returns Whatever `getDecryptedConfigKey` returns for the section.
     * @throws Error with a "Security Violation" message when `keyName` names a
     *   section other than this plugin's; also propagates any error raised by
     *   `getDecryptedConfigKey`.
     */
    get(keyName) {
        const section = keyName || this.pluginName;
        // Safety check: ensure plugins can only access their own config section
        if (section !== this.pluginName) {
            throw new Error(`Security Violation: Plugin "${this.pluginName}" attempted to access config for "${section}"`);
        }
        return getDecryptedConfigKey(section);
    }
    /**
     * Get a decrypted config key, or throw a user-friendly error if it's missing.
     *
     * @param keyName - Optional section name; a falsy value means "this plugin's own section".
     * @returns The decrypted value.
     * @throws Error with a "Security Violation" message (unwrapped) when
     *   `keyName` names another plugin's section; otherwise an Error telling
     *   the user which `crawlith config … set` command to run, with the
     *   underlying failure attached as `cause`.
     */
    require(keyName) {
        const section = keyName || this.pluginName;
        // Cross-plugin access is a programming error, not a missing credential.
        // Let get() raise its Security Violation as-is instead of wrapping it
        // in advice to run a `config set` command that would not help.
        if (section !== this.pluginName) {
            return this.get(keyName);
        }
        try {
            return this.get(keyName);
        }
        catch (error) {
            throw new Error(`Missing ${section} configuration. Please run: crawlith config ${section} set <value>`, { cause: error });
        }
    }
    /**
     * Set/Encrypt a config key for the current plugin.
     *
     * @param value - The value handed to `setEncryptedConfigKey` under this plugin's name.
     */
    set(value) {
        setEncryptedConfigKey(this.pluginName, value);
    }
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { CrawlithPlugin } from './plugin-types.js';
|
|
2
|
+
/**
 * Minimal logging surface accepted by `PluginLoader`.
 * Only `debug` is mandatory; the remaining levels may be left out.
 */
export interface PluginLoaderLogger {
    /** Required debug-level sink. */
    debug(msg: string): void;
    /** Optional info-level sink. */
    info?(msg: string): void;
    /** Optional warning-level sink. */
    warn?(msg: string): void;
    /** Optional error-level sink. */
    error?(msg: string): void;
}
|
|
8
|
+
/**
 * Loader that produces `CrawlithPlugin` instances for a given root path.
 *
 * NOTE(review): only the declaration is visible here. The private member
 * names suggest plugins are looked up in a directory and in node_modules
 * and validated before use — confirm against the implementation.
 */
export declare class PluginLoader {
    private plugins;
    private logger?;
    /**
     * @param logger - Optional logger; see `PluginLoaderLogger` for the expected shape.
     */
    constructor(logger?: PluginLoaderLogger);
    /**
     * @param rootPath - Path handed to the loader as the starting point for discovery.
     * @returns A promise resolving to an array of plugins.
     */
    discover(rootPath: string): Promise<CrawlithPlugin[]>;
    private loadFromDir;
    private loadFromNodeModules;
    private tryLoadPlugin;
    private validatePlugin;
}
|