@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/src/crawler/fetcher.ts
DELETED
|
@@ -1,251 +0,0 @@
|
|
|
1
|
-
import { request, Dispatcher } from 'undici';
|
|
2
|
-
import * as net from 'net';
|
|
3
|
-
import { IPGuard } from '../core/security/ipGuard.js';
|
|
4
|
-
import { RateLimiter } from '../core/network/rateLimiter.js';
|
|
5
|
-
import { RetryPolicy } from '../core/network/retryPolicy.js';
|
|
6
|
-
import { ResponseLimiter } from '../core/network/responseLimiter.js';
|
|
7
|
-
import { RedirectController } from '../core/network/redirectController.js';
|
|
8
|
-
import { ProxyAdapter } from '../core/network/proxyAdapter.js';
|
|
9
|
-
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
10
|
-
import { version } from '../utils/version.js';
|
|
11
|
-
|
|
12
|
-
/**
 * One hop in a followed redirect chain.
 *
 * The fetcher records a step each time a 3xx response (other than 304)
 * carries a usable `Location` header.
 */
export interface RedirectStep {
  /** URL that was requested and answered with the redirect. */
  url: string;
  /** HTTP status code of the redirect response. */
  status: number;
  /** Absolute URL obtained by resolving `Location` against `url`. */
  target: string;
}
|
|
17
|
-
|
|
18
|
-
/**
 * Outcome of a single `Fetcher.fetch` call.
 *
 * `status` is either the numeric HTTP status of the final response or one of
 * the string sentinels below, which describe why no usable response was
 * produced.
 */
export interface FetchResult {
  status:
    | number
    | 'blocked_internal_ip'
    | 'blocked_by_domain_filter'
    | 'blocked_subdomain'
    | 'oversized'
    | 'failed_after_retries'
    | 'network_error'
    | 'redirect_limit_exceeded'
    | 'redirect_loop'
    | 'proxy_connection_failed';
  /** Response headers of the final response; an empty object for error results. */
  headers: Record<string, string | string[] | undefined>;
  /** Response body as text; empty for 304, oversized and error results. */
  body: string;
  /** Redirect hops followed before the final request, in order. */
  redirectChain: RedirectStep[];
  /** `ETag` response header, or null when absent or not applicable. */
  etag: string | null;
  /** `Last-Modified` response header, or null when absent or not applicable. */
  lastModified: string | null;
  /** URL of the last request that was attempted. */
  finalUrl: string;
  /** Number of retry attempts made across the whole call. */
  retries?: number;
  /** Byte count reported by the response limiter while reading the body. */
  bytesReceived?: number;
}
|
|
38
|
-
|
|
39
|
-
/**
 * Per-request options accepted by `Fetcher.fetch`.
 */
export interface FetchOptions {
  /** Sent as `If-None-Match` on the first request of a redirect chain. */
  etag?: string;
  /** Sent as `If-Modified-Since` on the first request of a redirect chain. */
  lastModified?: string;
  /** NOTE(review): not read by `Fetcher.fetch` in this file — confirm whether callers still rely on it. */
  rate?: number;
  /** Maximum body size in bytes; `fetch` falls back to 2000000 when unset. */
  maxBytes?: number;
  /** Forwarded to the rate limiter together with the request hostname. */
  crawlDelay?: number;
}
|
|
46
|
-
|
|
47
|
-
/**
 * HTTP GET client used by the crawler.
 *
 * Each `fetch` call follows redirects manually (one request per hop) so that
 * every hop passes through the same checks: internal-IP guard, scope
 * validation, rate limiting, retry policy and response-size limiting.
 * Failures are reported through `FetchResult.status` sentinels rather than
 * thrown, except for an unparsable input URL, which still throws from
 * `new URL`.
 */
export class Fetcher {
  private userAgent = 'crawlith/1.0';
  private rateLimiter: RateLimiter;
  private proxyAdapter: ProxyAdapter;
  private secureDispatcher: Dispatcher;
  private scopeManager?: ScopeManager;
  private maxRedirects: number;

  /**
   * @param options.rate          Passed to the rate limiter; defaults to 2.
   * @param options.proxyUrl      Optional proxy URL handed to the proxy adapter.
   * @param options.scopeManager  Optional scope validator applied to every hop.
   * @param options.maxRedirects  Redirect budget; defaults to 2 and is capped at 11.
   * @param options.userAgent     Overrides the default `crawlith/<version>` agent.
   */
  constructor(options: {
    rate?: number;
    proxyUrl?: string;
    scopeManager?: ScopeManager;
    maxRedirects?: number;
    userAgent?: string;
  } = {}) {
    this.rateLimiter = new RateLimiter(options.rate || 2);
    this.proxyAdapter = new ProxyAdapter(options.proxyUrl);

    // A configured proxy supplies its own dispatcher; otherwise requests go
    // through the dispatcher provided by IPGuard.
    if (this.proxyAdapter.dispatcher) {
      this.secureDispatcher = this.proxyAdapter.dispatcher;
    } else {
      this.secureDispatcher = IPGuard.getSecureDispatcher();
    }

    this.scopeManager = options.scopeManager;
    this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
    this.userAgent = options.userAgent || `crawlith/${version}`;
  }

  /**
   * Fetches `url`, following redirects up to the configured limit.
   *
   * @param url      Absolute URL to request.
   * @param options  Conditional-GET validators, size limit and crawl delay.
   * @returns The final response, or an error result whose `status` is one of
   *          the string sentinels of `FetchResult`.
   */
  async fetch(url: string, options: FetchOptions = {}): Promise<FetchResult> {
    const maxBytes = options.maxBytes || 2000000;
    const redirectChain: RedirectStep[] = [];
    const redirectController = new RedirectController(this.maxRedirects, url);

    let currentUrl = url;
    let totalRetries = 0;

    // Use a while(true) and explicit return/continue to handle redirects
    while (true) {
      const urlObj = new URL(currentUrl);

      // 1. SSRF Guard (IP Literals only)
      // We only check explicit IP literals here to fail fast.
      // For domains, we rely on the secureDispatcher (which uses IPGuard.secureLookup)
      // to resolve and validate the IP at connection time, preventing TOCTOU attacks.
      //
      // URL.hostname keeps the square brackets around IPv6 literals ("[::1]"),
      // and net.isIP() returns 0 for the bracketed form, so the brackets must be
      // stripped or every IPv6 literal would skip this guard.
      // NOTE(review): assumes IPGuard.isInternal accepts bare IPv6 addresses — confirm.
      const hostLiteral = urlObj.hostname.startsWith('[') && urlObj.hostname.endsWith(']')
        ? urlObj.hostname.slice(1, -1)
        : urlObj.hostname;
      if (net.isIP(hostLiteral)) {
        if (IPGuard.isInternal(hostLiteral)) {
          return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
        }
      }

      // 2. Scope Validation (Domain & Subdomain)
      if (this.scopeManager) {
        const eligibility = this.scopeManager.isUrlEligible(currentUrl);
        if (eligibility !== 'allowed') {
          return this.errorResult(eligibility, currentUrl, redirectChain, totalRetries);
        }
      }

      // 3. Rate Limiting
      await this.rateLimiter.waitForToken(urlObj.hostname, options.crawlDelay);

      try {
        // 4. Retry Strategy
        const result = await RetryPolicy.execute(
          async (attempt) => {
            if (attempt > 0) totalRetries++;

            const headers: Record<string, string> = {
              'User-Agent': this.userAgent
            };

            // Conditional GET only for the FIRST request in a chain
            if (redirectChain.length === 0) {
              if (options.etag) headers['If-None-Match'] = options.etag;
              if (options.lastModified) headers['If-Modified-Since'] = options.lastModified;
            }

            const res = await request(currentUrl, {
              method: 'GET',
              headers,
              maxRedirections: 0,
              dispatcher: this.secureDispatcher,
              headersTimeout: 10000,
              bodyTimeout: 10000
            });

            if (RetryPolicy.isRetryableStatus(res.statusCode)) {
              // Drain the body before retrying so the response is not left unread.
              await res.body.dump();
              throw new Error(`Status ${res.statusCode}`);
            }

            return res;
          },
          (error) => RetryPolicy.isNetworkError(error) || error.message.startsWith('Status ')
        );

        const status = result.statusCode;
        const resHeaders = result.headers;

        // Returns the first value of a response header, or null when absent/empty.
        const getHeader = (name: string): string | null => {
          const val = resHeaders[name.toLowerCase()];
          if (Array.isArray(val)) return val[0];
          return (val as string) || null;
        };

        const etag = getHeader('etag');
        const lastModified = getHeader('last-modified');

        // Handle Redirects
        if (status >= 300 && status < 400 && status !== 304) {
          const location = getHeader('location');
          if (location) {
            let targetUrl: string;
            try {
              targetUrl = new URL(location, currentUrl).toString();
            } catch (_e) {
              // Bad redirect location, treat as final but maybe error?
              const body = await ResponseLimiter.streamToString(result.body, maxBytes);
              return { status, headers: resHeaders, body, redirectChain, etag: null, lastModified: null, finalUrl: currentUrl, retries: totalRetries };
            }

            const redirectError = redirectController.nextHop(targetUrl);
            if (redirectError) {
              await result.body.dump();
              return this.errorResult(redirectError, currentUrl, redirectChain, totalRetries);
            }

            redirectChain.push({ url: currentUrl, status, target: targetUrl });
            await result.body.dump();
            currentUrl = targetUrl;
            continue; // Next iteration for redirect target
          }
        }

        // 5. Max Response Size (Streaming)
        let bytesReceived = 0;
        try {
          const body = status === 304 ? '' : await ResponseLimiter.streamToString(
            result.body,
            maxBytes,
            (bytes) => { bytesReceived = bytes; }
          );

          return {
            status,
            headers: resHeaders,
            body,
            redirectChain,
            etag,
            lastModified,
            finalUrl: currentUrl,
            retries: totalRetries,
            bytesReceived
          };
        } catch (e: any) {
          if (e.message === 'Oversized response') {
            return {
              status: 'oversized',
              headers: resHeaders,
              body: '',
              redirectChain,
              etag: null,
              lastModified: null,
              finalUrl: currentUrl,
              retries: totalRetries,
              bytesReceived
            };
          }
          throw e;
        }

      } catch (error: any) {
        // Map common network errors to specific statuses if needed
        // NOTE(review): ECONNREFUSED is reported as a proxy failure even when no
        // proxy is configured — confirm whether that is intended.
        const isProxyError = error.message?.toLowerCase().includes('proxy') || error.code === 'ECONNREFUSED';

        if (error.code === 'EBLOCKED' || error.message?.includes('Blocked internal IP')) {
          return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
        }

        const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';

        return this.errorResult(
          totalRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus,
          currentUrl,
          redirectChain,
          totalRetries
        );
      }
    }
  }

  /**
   * Builds a `FetchResult` for a failed fetch: empty headers and body, no
   * validators, and the given status, URL, redirect chain and retry count.
   */
  private errorResult(status: any, finalUrl: string, redirectChain: RedirectStep[], retries: number): FetchResult {
    return {
      status,
      headers: {},
      body: '',
      redirectChain,
      etag: null,
      lastModified: null,
      finalUrl,
      retries
    };
  }
}
|
package/src/crawler/metricsRunner.ts
DELETED
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
import { getDb } from '../db/index.js';
|
|
2
|
-
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
3
|
-
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
4
|
-
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
5
|
-
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
6
|
-
import { computePageRank } from '../graph/pagerank.js';
|
|
7
|
-
import { calculateMetrics } from '../graph/metrics.js';
|
|
8
|
-
import { computeHITS } from '../scoring/hits.js';
|
|
9
|
-
import { EngineContext } from '../events.js';
|
|
10
|
-
import { calculateHealthScore, collectCrawlIssues } from '../scoring/health.js';
|
|
11
|
-
|
|
12
|
-
import { Graph } from '../graph/graph.js';
|
|
13
|
-
|
|
14
|
-
/**
 * Computes graph metrics for a finished crawl and persists them.
 *
 * Runs PageRank and HITS on the snapshot's graph, writes per-page metrics,
 * page-level redirect/trap data and duplicate/content clusters in a single
 * transaction, then marks the snapshot as 'completed' with aggregate stats.
 *
 * @param snapshotId     Snapshot to process. If it does not exist an error
 *                       event is emitted and the function returns early.
 * @param maxDepth       Passed through to `calculateMetrics`.
 * @param context        Optional event sink; without it events are written to
 *                       the console (errors to stderr, 'debug' events dropped).
 * @param limitReached   Stored on the snapshot as `limit_reached` (1 or 0).
 * @param graphInstance  Already-built graph to use instead of loading one
 *                       from the snapshot.
 */
export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, context?: EngineContext, limitReached: boolean = false, graphInstance?: Graph) {
  // Captured up front so the completion event can report real elapsed time.
  const startedAt = Date.now();

  const db = getDb();
  const metricsRepo = new MetricsRepository(db);
  const snapshotRepo = new SnapshotRepository(db);
  const pageRepo = new PageRepository(db);

  // Fallback emitter
  const emit = (event: any) => {
    if (context) {
      context.emit(event);
    } else {
      if (event.type === 'error') console.error(event.message);
      else if (event.type !== 'debug') console.log(event.message || event.phase);
    }
  };

  // Validate the snapshot before doing any graph work for it.
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
  if (!snapshot) {
    emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
    return;
  }

  // Announce the load before it happens, and only when a load is needed.
  if (!graphInstance) {
    emit({ type: 'metrics:start', phase: 'Loading graph' });
  }
  const graph = graphInstance || loadGraphFromSnapshot(snapshotId);

  emit({ type: 'metrics:start', phase: 'Computing PageRank' });
  computePageRank(graph);

  emit({ type: 'metrics:start', phase: 'Computing HITS' });
  computeHITS(graph);

  emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
  const nodes = graph.getNodes();

  // Pre-fetch all page IDs to avoid N+1 queries
  // Use getPagesIdentityBySnapshot to avoid loading full page content (HTML) into memory again
  const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
  const urlToId = new Map<string, number>();
  for (const p of pages) {
    urlToId.set(p.normalized_url, p.id);
  }

  const clusterStmt = db.prepare(`
    INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
    VALUES (?, ?, ?, ?, ?, ?)
  `);

  const contentStmt = db.prepare(`
    INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
    VALUES (?, ?, ?, ?, ?, ?)
  `);

  const tx = db.transaction(() => {
    for (const node of nodes) {
      // Nodes without a stored page row are skipped.
      const pageId = urlToId.get(node.url);
      if (!pageId) continue;

      metricsRepo.insertMetrics({
        snapshot_id: snapshotId,
        page_id: pageId,
        authority_score: node.authorityScore ?? null,
        hub_score: node.hubScore ?? null,
        pagerank: node.pageRank ?? null,
        pagerank_score: node.pageRankScore ?? null,
        link_role: node.linkRole ?? null,
        crawl_status: node.crawlStatus ?? null,
        word_count: node.wordCount ?? null,
        thin_content_score: node.thinContentScore ?? null,
        external_link_ratio: node.externalLinkRatio ?? null,
        orphan_score: node.orphanScore ?? null,
        duplicate_cluster_id: node.duplicateClusterId ?? null,
        duplicate_type: node.duplicateType ?? null,
        is_cluster_primary: node.isClusterPrimary ? 1 : 0
      });

      // Update page-level crawl trap data
      if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
        pageRepo.upsertPage({
          site_id: snapshot.site_id,
          normalized_url: node.url,
          last_seen_snapshot_id: snapshotId,
          redirect_chain: node.redirectChain ? JSON.stringify(node.redirectChain) : null,
          bytes_received: node.bytesReceived ?? null,
          crawl_trap_flag: node.crawlTrapFlag ? 1 : 0,
          crawl_trap_risk: node.crawlTrapRisk ?? null,
          trap_type: node.trapType ?? null,
        });
      }
    }

    // Save duplicate clusters
    for (const cluster of graph.duplicateClusters) {
      clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
    }

    // Save content clusters
    for (const cluster of graph.contentClusters) {
      contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
    }
  });
  tx();

  emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
  const metrics = calculateMetrics(graph, maxDepth);

  // Calculate penalty-based health score (matches CLI)
  const issues = collectCrawlIssues(graph, metrics);
  const health = calculateHealthScore(metrics.totalPages, issues);

  snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
    node_count: metrics.totalPages,
    edge_count: metrics.totalEdges,
    health_score: health.score,
    orphan_count: issues.orphanPages,
    thin_content_count: issues.thinContent,
    limit_reached: limitReached ? 1 : 0
  });

  emit({ type: 'metrics:complete', durationMs: Date.now() - startedAt });
}
|
package/src/crawler/normalize.ts
DELETED
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
/**
 * Options controlling how `normalizeUrl` treats a URL.
 */
export interface NormalizeOptions {
  /** When true the entire query string is removed instead of being filtered and sorted. */
  stripQuery?: boolean;
}
|
|
7
|
-
|
|
8
|
-
// Query parameter names that normalizeUrl drops because they only carry
// campaign / click-tracking information.
const TRACKING_PARAMS = new Set([
  // Campaign tagging
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
  // Click identifiers
  'fbclid', 'gclid', 'msclkid'
]);
|
|
18
|
-
|
|
19
|
-
// Lower-case file extensions (including the leading dot) for which
// normalizeUrl returns null instead of a URL.
const SKIP_EXTENSIONS = new Set([
  '.pdf',
  '.jpg',
  '.png',
  '.svg',
  '.webp',
  '.gif',
  '.zip',
  '.xml',
  '.json',
  '.mp4'
]);
|
|
23
|
-
|
|
24
|
-
/**
 * Produces the canonical string form of a URL, or null when it should not be
 * crawled.
 *
 * Steps: resolve `input` (against `base` when `base` is non-empty), reject
 * non-http(s) schemes, lower-case the host, drop default ports and the
 * fragment, either strip the query or remove tracking parameters and sort the
 * rest, collapse repeated slashes, drop a trailing slash (except for the
 * root), and reject paths ending in one of the skipped asset extensions.
 *
 * @param input    Absolute or relative URL.
 * @param base     Base URL for resolution; pass '' when `input` is absolute.
 * @param options  See `NormalizeOptions`.
 * @returns The normalized URL, or null if it is invalid, not http(s), or
 *          points at a skipped asset type.
 */
export function normalizeUrl(input: string, base: string, options: NormalizeOptions = {}): string | null {
  try {
    const target = base ? new URL(input, base) : new URL(input);

    // Only web schemes are kept.
    const isWeb = target.protocol === 'http:' || target.protocol === 'https:';
    if (!isWeb) {
      return null;
    }

    target.hostname = target.hostname.toLowerCase();

    // Drop the port when it is the default for the scheme.
    const defaultPort = target.protocol === 'http:' ? '80' : '443';
    if (target.port === defaultPort) {
      target.port = '';
    }

    target.hash = '';

    if (options.stripQuery) {
      target.search = '';
    } else {
      // Keep everything that is not a tracking parameter, in sorted key order.
      const kept = [...new URLSearchParams(target.search)].filter(
        ([key]) => !(key.startsWith('utm_') || TRACKING_PARAMS.has(key))
      );
      const rebuilt = new URLSearchParams(kept);
      rebuilt.sort();
      // An empty parameter list serializes to '', which clears the query.
      target.search = rebuilt.toString();
    }

    // Collapse runs of slashes, then drop a trailing slash unless the path is root.
    const collapsed = target.pathname.replace(/\/+/g, '/');
    const hasTrailingSlash = collapsed.length > 1 && collapsed.endsWith('/');
    target.pathname = hasTrailingSlash ? collapsed.slice(0, -1) : collapsed;

    // Reject non-HTML assets, judged by the text after the last dot in the path.
    const dotAt = target.pathname.lastIndexOf('.');
    if (dotAt !== -1 && SKIP_EXTENSIONS.has(target.pathname.slice(dotAt).toLowerCase())) {
      return null;
    }

    return target.toString();
  } catch (_e) {
    return null;
  }
}
|
package/src/crawler/parser.ts
DELETED
|
@@ -1,190 +0,0 @@
|
|
|
1
|
-
import * as cheerio from 'cheerio';
|
|
2
|
-
import crypto from 'node:crypto';
|
|
3
|
-
import { normalizeUrl } from './normalize.js';
|
|
4
|
-
import { SimHash } from '../graph/simhash.js';
|
|
5
|
-
|
|
6
|
-
/**
 * An outgoing link found on a parsed page.
 */
export interface ParseLink {
  /** Absolute http(s) URL of the link target, with the fragment removed. */
  url: string;
  /**
   * Placement weight assigned by the parser: 1.0 by default, 0.7 for
   * navigation/header context and 0.4 for footer context.
   */
  weight: number;
}
|
|
10
|
-
|
|
11
|
-
/**
 * Everything `Parser.parse` extracts from one HTML document.
 */
export interface ParseResult {
  /** Outgoing links; left empty when the page's robots meta sets nofollow. */
  links: ParseLink[];
  /** NOTE(review): presumably the HTML that was parsed — confirm against `Parser.parse`. */
  html: string;
  /** Normalized `<link rel="canonical">` target, or null when absent or invalid. */
  canonical: string | null;
  /** True when the robots meta contains `noindex` or `none`. */
  noindex: boolean;
  /** True when the robots meta contains `nofollow` or `none`. */
  nofollow: boolean;
  /** Hash of the page content. */
  contentHash: string;
  /** Similarity hash of the content, when computed. */
  simhash?: string;
  /** NOTE(review): presumably unique tokens divided by total tokens — confirm. */
  uniqueTokenRatio?: number;
  /** Soft-404 score for the page. */
  soft404Score: number;
  /** Signals that contributed to `soft404Score`. */
  soft404Signals: string[];
}
|
|
23
|
-
|
|
24
|
-
export class Parser {
  /**
   * Parses HTML content to extract metadata and links.
   *
   * Steps, in order: robots directives, canonical URL, weighted outbound
   * links, content hash / simhash over the visible body text, and soft-404
   * heuristics (only evaluated for responses with status 200).
   *
   * @param html    Raw HTML of the page; returned unchanged in the result.
   * @param baseUrl URL of the page, used to resolve relative hrefs.
   * @param status  HTTP status code the page was served with.
   * @returns The extracted metadata, links, hashes and soft-404 assessment.
   */
  parse(html: string, baseUrl: string, status: number): ParseResult {
    const $ = cheerio.load(html);

    // 1. Robots Meta
    const { noindex, nofollow } = this.readRobotsDirectives($);

    // 2. Canonical
    const canonical = this.readCanonical($, baseUrl);

    // 3. Links (not extracted at all when the page itself is nofollow)
    const links = nofollow
      ? new Map<string, number>()
      : this.extractLinks($, baseUrl);

    // 4. Content Hash (ignoring script/style/noscript/iframe).
    // These removals mutate the document, so they must stay AFTER link
    // extraction and BEFORE any text is read for hashing or soft-404 checks.
    $('script').remove();
    $('style').remove();
    $('noscript').remove();
    $('iframe').remove();

    const cleanText = $('body').text().replace(/\s+/g, ' ').trim();
    const contentHash = crypto.createHash('sha256').update(cleanText).digest('hex');

    // 4b. Simhash & Token calculation (limit to 50k chars for performance)
    const limitedText = cleanText.substring(0, 50000).toLowerCase();
    const tokens = limitedText.split(/\W+/).filter(t => t.length > 0);
    const uniqueTokenRatio = tokens.length > 0 ? new Set(tokens).size / tokens.length : 0;
    const simhash = SimHash.generate(tokens).toString();

    // 5. Soft 404 Detection (only meaningful for pages served as 200)
    let soft404Score = 0;
    let soft404Signals: string[] = [];
    if (status === 200) {
      const verdict = this.detectSoft404($, cleanText, links.size, !nofollow);
      soft404Score = verdict.score;
      soft404Signals = verdict.signals;
    }

    return {
      links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
      html: html, // pass raw HTML for analysis
      canonical,
      noindex,
      nofollow,
      contentHash,
      simhash,
      uniqueTokenRatio,
      soft404Score,
      soft404Signals
    };
  }

  /**
   * Collects `noindex` / `nofollow` from every robots meta tag on the page.
   * The meta name is compared case-insensitively and directives from
   * multiple tags are combined; `none` implies both flags.
   */
  private readRobotsDirectives($: ReturnType<typeof cheerio.load>): { noindex: boolean; nofollow: boolean } {
    let noindex = false;
    let nofollow = false;

    $('meta[name]').each((_, element) => {
      const $meta = $(element);
      if (($meta.attr('name') || '').trim().toLowerCase() !== 'robots') return;

      const directives = ($meta.attr('content') || '')
        .toLowerCase()
        .split(',')
        .map(s => s.trim());
      const none = directives.includes('none');
      if (none || directives.includes('noindex')) noindex = true;
      if (none || directives.includes('nofollow')) nofollow = true;
    });

    return { noindex, nofollow };
  }

  /**
   * Resolves the page's canonical link against `baseUrl` and normalizes it
   * without stripping the query string (the query may be significant for a
   * canonical). Returns null when there is no canonical or it is not a
   * valid URL.
   */
  private readCanonical($: ReturnType<typeof cheerio.load>, baseUrl: string): string | null {
    const canonicalLink = $('link[rel="canonical"]').attr('href');
    if (!canonicalLink) return null;

    try {
      const resolved = new URL(canonicalLink, baseUrl);
      return normalizeUrl(resolved.toString(), '', { stripQuery: false });
    } catch (_e) {
      // Invalid canonical URL, ignore
      return null;
    }
  }

  /**
   * Extracts followable http(s) links, keyed by absolute URL (fragment
   * removed). Each URL maps to the highest weight among its occurrences:
   * 1.0 body, 0.7 nav/header/menu, 0.4 footer.
   */
  private extractLinks($: ReturnType<typeof cheerio.load>, baseUrl: string): Map<string, number> {
    const links = new Map<string, number>();

    $('a').each((_, element) => {
      const $el = $(element);
      const href = $el.attr('href');
      if (!href) return;

      // rel is a token list; match the whole token rather than a substring.
      const relTokens = ($el.attr('rel') || '').toLowerCase().split(/[\s,]+/);
      if (relTokens.includes('nofollow')) return;

      let absoluteUrl: URL;
      try {
        absoluteUrl = new URL(href, baseUrl);
      } catch (_e) {
        // Invalid URL
        return;
      }
      if (absoluteUrl.protocol !== 'http:' && absoluteUrl.protocol !== 'https:') return;

      absoluteUrl.hash = '';
      const urlStr = absoluteUrl.toString();

      // Calculate Weight
      let weight = 1.0; // Default: Body

      // Semantic check on ancestor elements first
      if ($el.closest('nav').length > 0 || $el.closest('header').length > 0) {
        weight = 0.7;
      } else if ($el.closest('footer').length > 0) {
        weight = 0.4;
      } else {
        // Secondary check: class/id hints on the parent and grandparent
        const $parent = $el.parent();
        const $grandParent = $parent.parent();
        const combinedContext = (
          ($parent.attr('class') || '') +
          ($parent.attr('id') || '') +
          ($grandParent.attr('class') || '') +
          ($grandParent.attr('id') || '')
        ).toLowerCase();

        if (combinedContext.includes('nav') || combinedContext.includes('menu')) {
          weight = 0.7;
        } else if (combinedContext.includes('footer')) {
          weight = 0.4;
        }
      }

      // Store highest weight if multiple links to same URL
      const currentMax = links.get(urlStr);
      if (currentMax === undefined || weight > currentMax) {
        links.set(urlStr, weight);
      }
    });

    return links;
  }

  /**
   * Scores how likely a 200 response is really an error page.
   *
   * Must be called after non-content elements were removed from `$`, with
   * `cleanText` being the whitespace-collapsed body text.
   *
   * @param linkCount      Number of links extracted from the page.
   * @param linksExtracted False when extraction was skipped (page-level
   *                       nofollow); the "no outbound links" signal is then
   *                       not applied, since the count says nothing about
   *                       the page.
   * @returns Score capped at 1.0 and the names of the contributing signals.
   */
  private detectSoft404(
    $: ReturnType<typeof cheerio.load>,
    cleanText: string,
    linkCount: number,
    linksExtracted: boolean
  ): { score: number; signals: string[] } {
    let score = 0;
    const signals: string[] = [];

    const title = $('title').text().toLowerCase();
    const h1Text = $('h1').first().text().toLowerCase();
    const bodyText = cleanText.toLowerCase();

    const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];

    // Pattern checks: only the first matching pattern counts per location
    const titleHit = errorPatterns.find(pattern => title.includes(pattern));
    if (titleHit !== undefined) {
      score += 0.4;
      signals.push(`title_pattern_${titleHit.replace(/\s+/g, '_')}`);
    }

    const h1Hit = errorPatterns.find(pattern => h1Text.includes(pattern));
    if (h1Hit !== undefined) {
      score += 0.3;
      signals.push(`h1_pattern_${h1Hit.replace(/\s+/g, '_')}`);
    }

    if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
      score += 0.2;
      signals.push('body_error_phrase');
    }

    // Content length check (Word count approximation)
    const wordCount = cleanText.split(/\s+/).filter(w => w.length > 0).length;
    if (wordCount < 50) {
      score += 0.3;
      signals.push('very_low_word_count');
    } else if (wordCount < 150) {
      score += 0.1;
      signals.push('low_word_count');
    }

    // Link count check
    if (linksExtracted && linkCount === 0) {
      score += 0.2;
      signals.push('no_outbound_links');
    }

    // Cap at 1.0
    return { score: Math.min(1.0, score), signals };
  }
}
|