@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -1,171 +0,0 @@
|
|
|
1
|
-
import * as dns from 'dns';
|
|
2
|
-
import * as net from 'net';
|
|
3
|
-
import { promisify } from 'util';
|
|
4
|
-
import { Agent } from 'undici';
|
|
5
|
-
|
|
6
|
-
// Promise-returning wrapper around the callback-style dns.resolve4 (IPv4 address resolution).
const resolve4 = promisify(dns.resolve4);
|
|
7
|
-
// Promise-returning wrapper around the callback-style dns.resolve6 (IPv6 address resolution).
const resolve6 = promisify(dns.resolve6);
|
|
8
|
-
|
|
9
|
-
/**
 * Guards outbound requests against internal/private network targets.
 *
 * Provides a pure classifier for IP literals ({@link IPGuard.isInternal}),
 * a pre-flight hostname check ({@link IPGuard.validateHost}) and a DNS
 * `lookup` hook ({@link IPGuard.secureLookup}) that rejects internal
 * addresses at resolution time.
 */
export class IPGuard {
  /**
   * Checks if an IP address is internal/private.
   *
   * IPv4 ranges treated as internal: 127/8, 10/8, 192.168/16, 172.16/12,
   * 169.254/16 and 0/8. IPv6 addresses treated as internal: `::`, `::1`,
   * fc00::/7, fe80::/10, and IPv4-mapped addresses whose embedded IPv4
   * address is internal.
   *
   * @param ip - An IPv4 or IPv6 literal (no surrounding brackets).
   * @returns `true` when the address is internal, or when `ip` is not a
   *   valid IP literal at all (unknown input is blocked for safety).
   */
  static isInternal(ip: string): boolean {
    if (net.isIPv4(ip)) {
      const parts = ip.split('.').map(Number);

      // 127.0.0.0/8 (loopback)
      if (parts[0] === 127) return true;

      // 10.0.0.0/8
      if (parts[0] === 10) return true;

      // 192.168.0.0/16
      if (parts[0] === 192 && parts[1] === 168) return true;

      // 172.16.0.0 – 172.31.255.255
      if (parts[0] === 172 && parts[1] >= 16 && parts[1] <= 31) return true;

      // 169.254.0.0/16 (link-local)
      if (parts[0] === 169 && parts[1] === 254) return true;

      // 0.0.0.0/8
      if (parts[0] === 0) return true;

      return false;
    }

    if (net.isIPv6(ip)) {
      // Canonical form: lower-case, zone id removed, eight 4-digit groups.
      const expanded = IPGuard.expandIPv6(ip);

      // :: (unspecified) — the IPv6 counterpart of 0.0.0.0, which is blocked above.
      if (expanded === '0000:0000:0000:0000:0000:0000:0000:0000') return true;

      // ::1 (loopback)
      if (expanded === '0000:0000:0000:0000:0000:0000:0000:0001') return true;

      // Every group is zero-padded, so the first group is always the first 4 characters.
      const firstWord = parseInt(expanded.slice(0, 4), 16);

      // fc00::/7 (Unique Local Address) -> fc or fd
      if ((firstWord & 0xfe00) === 0xfc00) return true;

      // fe80::/10 (Link Local)
      if ((firstWord & 0xffc0) === 0xfe80) return true;

      // IPv4-mapped IPv6: ::ffff:0:0/96 — classify by the embedded IPv4 address.
      if (expanded.startsWith('0000:0000:0000:0000:0000:ffff:')) {
        const groups = expanded.split(':');
        const p7 = parseInt(groups[6], 16);
        const p8 = parseInt(groups[7], 16);
        const ip4 = `${(p7 >> 8) & 255}.${p7 & 255}.${(p8 >> 8) & 255}.${p8 & 255}`;
        return IPGuard.isInternal(ip4);
      }

      return false;
    }

    return true; // Unknown format, block it for safety
  }

  /**
   * Resolves a hostname and validates all resulting IPs.
   *
   * @param host - A hostname or IP literal; an IPv6 literal may be wrapped
   *   in brackets as it appears in a URL host (`[::1]`).
   * @returns `true` when the host may be contacted: a non-internal IP
   *   literal, a name whose resolved addresses are all non-internal, or a
   *   name that resolved to nothing (the fetcher is left to report the DNS
   *   failure). `false` otherwise.
   */
  static async validateHost(host: string): Promise<boolean> {
    // URL hosts carry IPv6 literals in brackets; without unwrapping, "[::1]"
    // would be treated as a name, resolve to nothing, and be allowed.
    const bare = host.startsWith('[') && host.endsWith(']') ? host.slice(1, -1) : host;

    if (net.isIP(bare)) {
      return !IPGuard.isInternal(bare);
    }

    try {
      // The two queries are independent, so run them concurrently. Each
      // failure is mapped to "no records" rather than rejecting.
      const [res4, res6] = await Promise.all([
        resolve4(bare).catch(() => [] as string[]),
        resolve6(bare).catch(() => [] as string[]),
      ]);
      const ips = [...res4, ...res6];

      if (ips.length === 0) return true; // Let the fetcher handle DNS failures

      return ips.every((ip) => !IPGuard.isInternal(ip));
    } catch (_e) {
      // Any unexpected failure during resolution blocks the host.
      return false;
    }
  }

  /**
   * Custom lookup function for undici that validates the resolved IP.
   * Prevents DNS Rebinding attacks by checking the IP immediately before connection.
   *
   * Delegates to `dns.lookup`; if any returned address is internal, the
   * callback receives an error with `code === 'EBLOCKED'`.
   *
   * @param hostname - Name to resolve.
   * @param options - Options forwarded unchanged to `dns.lookup`.
   * @param callback - Node-style lookup callback.
   */
  static secureLookup(
    hostname: string,
    options: dns.LookupOneOptions | dns.LookupAllOptions,
    callback: (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => void
  ): void {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- the options union does not match a single dns.lookup overload
    dns.lookup(hostname, options as any, (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => {
      if (err) {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- address may be undefined on error
        return callback(err, address as any, family);
      }

      // dns.lookup yields a single string, or an array when options.all is true.
      let candidates: string[] = [];
      if (typeof address === 'string') {
        candidates = [address];
      } else if (Array.isArray(address)) {
        candidates = address.map((entry) => entry.address);
      }

      const blocked = candidates.find((ip) => IPGuard.isInternal(ip));
      if (blocked !== undefined) {
        // Return a custom error that undici will propagate
        const blockedError: NodeJS.ErrnoException = new Error(`Blocked internal IP: ${blocked}`);
        blockedError.code = 'EBLOCKED';
        return callback(blockedError, address, family);
      }

      callback(null, address, family);
    });
  }

  /**
   * Returns an undici Agent configured with secure DNS lookup.
   *
   * @returns A new `Agent` whose `connect.lookup` is {@link IPGuard.secureLookup}.
   */
  static getSecureDispatcher(): Agent {
    return new Agent({
      connect: {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- signature is compatible at runtime
        lookup: IPGuard.secureLookup as any
      }
    });
  }

  /**
   * Expands an IPv6 literal to eight lower-case, zero-padded 4-digit groups.
   *
   * Handles `::` compression and a trailing dotted-quad IPv4 part. A zone
   * id (`%eth0`) is dropped and hex digits are lower-cased so callers can
   * compare the result against fixed strings.
   *
   * @param ip - A literal already accepted by `net.isIPv6`.
   * @returns The fully expanded form, e.g. `0000:...:0001` for `::1`.
   */
  private static expandIPv6(ip: string): string {
    const bare = ip.replace(/%.*$/, '').toLowerCase();
    if (bare === '::') return '0000:0000:0000:0000:0000:0000:0000:0000';

    let normalizedIp = bare;
    if (bare.includes('.')) {
      // Rewrite a trailing dotted quad (a.b.c.d) as two hex groups.
      const lastColonIndex = bare.lastIndexOf(':');
      const lastPart = bare.substring(lastColonIndex + 1);
      if (net.isIPv4(lastPart)) {
        const parts = lastPart.split('.').map(Number);
        const hex1 = ((parts[0] << 8) | parts[1]).toString(16);
        const hex2 = ((parts[2] << 8) | parts[3]).toString(16);
        normalizedIp = bare.substring(0, lastColonIndex + 1) + hex1 + ':' + hex2;
      }
    }

    let full = normalizedIp;
    if (normalizedIp.includes('::')) {
      // Replace the "::" with as many zero groups as are needed to reach eight.
      const parts = normalizedIp.split('::');
      const left = parts[0].split(':').filter((x) => x !== '');
      const right = parts[1].split(':').filter((x) => x !== '');
      const missing = 8 - (left.length + right.length);
      const middle = Array(missing).fill('0000');
      full = [...left, ...middle, ...right].join(':');
    }
    return full.split(':').map((part) => part.padStart(4, '0')).join(':');
  }
}
|
package/src/crawler/crawl.ts
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import { Crawler, CrawlOptions } from './crawler.js';
|
|
2
|
-
import { EngineContext } from '../events.js';
|
|
3
|
-
|
|
4
|
-
export { CrawlOptions };
|
|
5
|
-
|
|
6
|
-
/**
 * Convenience entry point: builds a `Crawler` for `startUrl` and runs it.
 *
 * @param startUrl - URL handed to the `Crawler` constructor.
 * @param options - Crawl options forwarded unchanged to the `Crawler`.
 * @param context - Optional engine context forwarded unchanged to the `Crawler`.
 * @returns The number produced by `Crawler.run()` (its meaning is defined by `Crawler`; not visible here).
 */
export async function crawl(startUrl: string, options: CrawlOptions, context?: EngineContext): Promise<number> {
  return new Crawler(startUrl, options, context).run();
}
|