recker 1.0.93 → 1.0.94-next.83dffd9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/index.d.ts +3 -1
- package/dist/browser/scrape/index.js +2 -0
- package/dist/browser/scrape/spider.d.ts +36 -2
- package/dist/browser/scrape/spider.js +209 -58
- package/dist/browser/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/browser/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/browser/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/browser/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -0
- package/dist/browser/seo/seo-spider.js +17 -3
- package/dist/cli/commands/spider-runner.js +2 -1
- package/dist/cli/index.js +0 -3
- package/dist/cli/tui/jobs/spider-job.js +2 -1
- package/dist/mcp/prompts/index.js +15 -6
- package/dist/scrape/index.d.ts +3 -1
- package/dist/scrape/index.js +2 -0
- package/dist/scrape/spider.d.ts +36 -2
- package/dist/scrape/spider.js +209 -58
- package/dist/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/seo/seo-spider.d.ts +2 -0
- package/dist/seo/seo-spider.js +17 -3
- package/dist/version.js +1 -1
- package/package.json +7 -1
|
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
|
|
|
3
3
|
export { ScrapeDocument } from './document.js';
|
|
4
4
|
export { ScrapeElement } from './element.js';
|
|
5
5
|
export { Spider, spider } from './spider.js';
|
|
6
|
-
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
6
|
+
export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
|
|
7
7
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
8
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
8
9
|
export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
9
10
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
11
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
10
12
|
export type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
11
13
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
12
14
|
export type { ProxyAdapter } from './proxy-adapter.js';
|
|
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
|
|
|
3
3
|
export { ScrapeElement } from './element.js';
|
|
4
4
|
export { Spider, spider } from './spider.js';
|
|
5
5
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
6
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
6
7
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
8
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
7
9
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
8
10
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ScrapeDocument } from './document.js';
|
|
1
2
|
import type { ExtractedLink, ExtractionSchema } from './types.js';
|
|
2
3
|
import type { Options as ParserOptions } from './parser/index.js';
|
|
3
4
|
import { type SitemapUrl } from '../seo/validators/sitemap.js';
|
|
@@ -31,8 +32,7 @@ export interface SpiderOptions {
|
|
|
31
32
|
proxy?: string | string[] | ProxyAdapter;
|
|
32
33
|
transport?: SpiderTransport;
|
|
33
34
|
preferCurlFirst?: boolean;
|
|
34
|
-
onPage?: (
|
|
35
|
-
onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
|
|
35
|
+
onPage?: (event: SpiderPageEvent) => void | Promise<void>;
|
|
36
36
|
onCaptchaDetected?: (result: {
|
|
37
37
|
url: string;
|
|
38
38
|
status: number;
|
|
@@ -40,9 +40,31 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
|
63
|
+
domainRateLimit?: {
|
|
64
|
+
maxPerSecond?: number;
|
|
65
|
+
};
|
|
66
|
+
deduplicateContent?: boolean;
|
|
67
|
+
resume?: boolean;
|
|
46
68
|
crawlQueue?: CrawlQueueAdapter;
|
|
47
69
|
crawlStorage?: CrawlStorageAdapter;
|
|
48
70
|
}
|
|
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
|
|
|
102
124
|
stylesheets: number;
|
|
103
125
|
};
|
|
104
126
|
extracted?: Record<string, unknown>;
|
|
127
|
+
contentHash?: string;
|
|
128
|
+
isDuplicate?: boolean;
|
|
129
|
+
duplicateOf?: string;
|
|
130
|
+
}
|
|
131
|
+
export interface SpiderPageEvent {
|
|
132
|
+
result: SpiderPageResult;
|
|
133
|
+
html?: string;
|
|
134
|
+
document?: () => Promise<ScrapeDocument>;
|
|
105
135
|
}
|
|
106
136
|
export interface SpiderProgress {
|
|
107
137
|
crawled: number;
|
|
@@ -162,7 +192,10 @@ export declare class Spider {
|
|
|
162
192
|
private baseHost;
|
|
163
193
|
private running;
|
|
164
194
|
private aborted;
|
|
195
|
+
private abortController;
|
|
165
196
|
private pendingCount;
|
|
197
|
+
private domainRequestTimestamps;
|
|
198
|
+
private contentHashes;
|
|
166
199
|
private blockedDomains;
|
|
167
200
|
private curlTransport;
|
|
168
201
|
private curlAvailable;
|
|
@@ -172,6 +205,7 @@ export declare class Spider {
|
|
|
172
205
|
private robotsData;
|
|
173
206
|
private sitemapValidation;
|
|
174
207
|
private robotsValidation;
|
|
208
|
+
private waitForDomainRateLimit;
|
|
175
209
|
private toHeaderRecord;
|
|
176
210
|
constructor(options?: SpiderOptions);
|
|
177
211
|
crawl(startUrl: string): Promise<SpiderResult>;
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
1
2
|
import { performance } from 'node:perf_hooks';
|
|
2
3
|
import { createClient } from '../core/client.js';
|
|
3
4
|
import { ScrapeDocument } from './document.js';
|
|
@@ -39,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
|
|
|
39
40
|
function getHostname(url) {
|
|
40
41
|
return new URL(url).hostname;
|
|
41
42
|
}
|
|
42
|
-
function sleep(ms) {
|
|
43
|
+
function sleep(ms, signal) {
|
|
43
44
|
if (ms <= 0)
|
|
44
45
|
return Promise.resolve();
|
|
45
|
-
|
|
46
|
+
if (signal?.aborted)
|
|
47
|
+
return Promise.resolve();
|
|
48
|
+
return new Promise(resolve => {
|
|
49
|
+
const timer = setTimeout(resolve, ms);
|
|
50
|
+
signal?.addEventListener('abort', () => { clearTimeout(timer); resolve(); }, { once: true });
|
|
51
|
+
});
|
|
46
52
|
}
|
|
47
53
|
function getRetryAfterDelay(response) {
|
|
48
54
|
const retryAfter = response.headers.get('retry-after');
|
|
@@ -108,7 +114,8 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
108
114
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
109
115
|
return false;
|
|
110
116
|
}
|
|
111
|
-
|
|
117
|
+
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
118
|
+
if (options.sameDomain !== false && hostname !== baseHost) {
|
|
112
119
|
return false;
|
|
113
120
|
}
|
|
114
121
|
const skipExtensions = [
|
|
@@ -188,7 +195,10 @@ export class Spider {
|
|
|
188
195
|
baseHost = '';
|
|
189
196
|
running = false;
|
|
190
197
|
aborted = false;
|
|
198
|
+
abortController = new AbortController();
|
|
191
199
|
pendingCount = 0;
|
|
200
|
+
domainRequestTimestamps = new Map();
|
|
201
|
+
contentHashes = new Map();
|
|
192
202
|
blockedDomains = new Set();
|
|
193
203
|
curlTransport = null;
|
|
194
204
|
curlAvailable = false;
|
|
@@ -198,6 +208,33 @@ export class Spider {
|
|
|
198
208
|
robotsData = null;
|
|
199
209
|
sitemapValidation = null;
|
|
200
210
|
robotsValidation = null;
|
|
211
|
+
async waitForDomainRateLimit(hostname) {
|
|
212
|
+
const limit = this.options.domainRateLimit?.maxPerSecond;
|
|
213
|
+
if (!limit || limit <= 0)
|
|
214
|
+
return;
|
|
215
|
+
const now = Date.now();
|
|
216
|
+
const window = 1000;
|
|
217
|
+
let timestamps = this.domainRequestTimestamps.get(hostname);
|
|
218
|
+
if (!timestamps) {
|
|
219
|
+
timestamps = [];
|
|
220
|
+
this.domainRequestTimestamps.set(hostname, timestamps);
|
|
221
|
+
}
|
|
222
|
+
while (timestamps.length > 0 && timestamps[0] <= now - window) {
|
|
223
|
+
timestamps.shift();
|
|
224
|
+
}
|
|
225
|
+
if (timestamps.length >= limit) {
|
|
226
|
+
const waitMs = timestamps[0] + window - now;
|
|
227
|
+
if (waitMs > 0)
|
|
228
|
+
await sleep(waitMs, this.abortController.signal);
|
|
229
|
+
if (this.aborted)
|
|
230
|
+
return;
|
|
231
|
+
const afterWait = Date.now();
|
|
232
|
+
while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
|
|
233
|
+
timestamps.shift();
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
timestamps.push(Date.now());
|
|
237
|
+
}
|
|
201
238
|
toHeaderRecord(headers) {
|
|
202
239
|
const headerRecord = {};
|
|
203
240
|
headers.forEach((value, key) => {
|
|
@@ -239,11 +276,17 @@ export class Spider {
|
|
|
239
276
|
exclude: options.exclude,
|
|
240
277
|
include: options.include,
|
|
241
278
|
onPage: options.onPage,
|
|
242
|
-
onPageWithHtml: options.onPageWithHtml,
|
|
243
279
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
280
|
+
onBlocked: options.onBlocked,
|
|
281
|
+
onError: options.onError,
|
|
282
|
+
onRetry: options.onRetry,
|
|
283
|
+
onRedirect: options.onRedirect,
|
|
244
284
|
onProgress: options.onProgress,
|
|
245
285
|
extract: extractSchema,
|
|
246
286
|
parserOptions: options.parserOptions,
|
|
287
|
+
domainRateLimit: options.domainRateLimit,
|
|
288
|
+
deduplicateContent: options.deduplicateContent ?? false,
|
|
289
|
+
resume: options.resume ?? false,
|
|
247
290
|
};
|
|
248
291
|
if (options.proxy) {
|
|
249
292
|
if (typeof options.proxy === 'string') {
|
|
@@ -281,22 +324,41 @@ export class Spider {
|
|
|
281
324
|
const startTimestamp = Date.now();
|
|
282
325
|
const normalizedStart = normalizeUrl(startUrl);
|
|
283
326
|
const baseUrl = new URL(normalizedStart).origin;
|
|
284
|
-
this.baseHost = new URL(normalizedStart).hostname;
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
327
|
+
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
328
|
+
if (!this.options.resume) {
|
|
329
|
+
await this.crawlQueue.clear();
|
|
330
|
+
await this.crawlStorage.clear();
|
|
331
|
+
this._visitedCount = 0;
|
|
332
|
+
this._queueSize = 0;
|
|
333
|
+
this._resultCount = 0;
|
|
334
|
+
this.domainRequestTimestamps.clear();
|
|
335
|
+
this.contentHashes.clear();
|
|
336
|
+
}
|
|
337
|
+
else {
|
|
338
|
+
this._queueSize = await this.crawlQueue.size();
|
|
339
|
+
this._resultCount = await this.crawlStorage.getResultCount();
|
|
340
|
+
}
|
|
290
341
|
this.running = true;
|
|
291
342
|
this.aborted = false;
|
|
343
|
+
this.abortController = new AbortController();
|
|
292
344
|
this.pendingCount = 0;
|
|
293
345
|
this.sitemapUrls = [];
|
|
294
346
|
this.sitemapUrlSet.clear();
|
|
295
347
|
this.robotsData = null;
|
|
296
348
|
this.sitemapValidation = null;
|
|
297
349
|
this.robotsValidation = null;
|
|
298
|
-
this.
|
|
299
|
-
|
|
350
|
+
if (!this.options.resume) {
|
|
351
|
+
this.blockedDomains.clear();
|
|
352
|
+
this.domainStates.clear();
|
|
353
|
+
}
|
|
354
|
+
if (this.options.resume && this.options.deduplicateContent) {
|
|
355
|
+
const existingResults = await this.crawlStorage.getResults();
|
|
356
|
+
for (const r of existingResults) {
|
|
357
|
+
if (r.contentHash) {
|
|
358
|
+
this.contentHashes.set(r.contentHash, r.url);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
}
|
|
300
362
|
if (this.options.transport !== 'undici') {
|
|
301
363
|
this.curlAvailable = await hasImpersonate();
|
|
302
364
|
if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
|
|
@@ -312,10 +374,12 @@ export class Spider {
|
|
|
312
374
|
const pending = new Map();
|
|
313
375
|
const scheduleUrl = async (item) => {
|
|
314
376
|
const normalized = normalizeUrl(item.url);
|
|
315
|
-
if (await this.crawlQueue.hasVisited(normalized))
|
|
316
|
-
return;
|
|
317
377
|
if (pending.has(normalized))
|
|
318
378
|
return;
|
|
379
|
+
if (await this.crawlQueue.hasVisited(normalized))
|
|
380
|
+
return;
|
|
381
|
+
await this.crawlQueue.markVisited(normalized);
|
|
382
|
+
this._visitedCount++;
|
|
319
383
|
if (item.depth > this.options.maxDepth)
|
|
320
384
|
return;
|
|
321
385
|
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
@@ -331,8 +395,6 @@ export class Spider {
|
|
|
331
395
|
return;
|
|
332
396
|
}
|
|
333
397
|
}
|
|
334
|
-
await this.crawlQueue.markVisited(normalized);
|
|
335
|
-
this._visitedCount++;
|
|
336
398
|
this.pendingCount++;
|
|
337
399
|
const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
|
|
338
400
|
.finally(() => {
|
|
@@ -341,16 +403,18 @@ export class Spider {
|
|
|
341
403
|
});
|
|
342
404
|
pending.set(normalized, promise);
|
|
343
405
|
};
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
406
|
+
if (!this.options.resume) {
|
|
407
|
+
await scheduleUrl({ url: normalizedStart, depth: 0 });
|
|
408
|
+
if (this.options.useSitemap && this.sitemapUrls.length > 0) {
|
|
409
|
+
for (const sitemapUrl of this.sitemapUrls) {
|
|
410
|
+
try {
|
|
411
|
+
const urlHost = new URL(sitemapUrl.loc).hostname;
|
|
412
|
+
if (urlHost === this.baseHost) {
|
|
413
|
+
await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
catch {
|
|
351
417
|
}
|
|
352
|
-
}
|
|
353
|
-
catch {
|
|
354
418
|
}
|
|
355
419
|
}
|
|
356
420
|
}
|
|
@@ -372,14 +436,33 @@ export class Spider {
|
|
|
372
436
|
if (pending.size > 0) {
|
|
373
437
|
await Promise.all(pending.values());
|
|
374
438
|
}
|
|
439
|
+
while (!this.aborted && this._resultCount < this.options.maxPages) {
|
|
440
|
+
const remaining = await this.crawlQueue.size();
|
|
441
|
+
if (remaining === 0 && pending.size === 0)
|
|
442
|
+
break;
|
|
443
|
+
this._queueSize = remaining;
|
|
444
|
+
let nextItem = await this.crawlQueue.pop();
|
|
445
|
+
while (nextItem && !this.aborted) {
|
|
446
|
+
this._queueSize = Math.max(0, this._queueSize - 1);
|
|
447
|
+
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
448
|
+
break;
|
|
449
|
+
await scheduleUrl(nextItem);
|
|
450
|
+
nextItem = await this.crawlQueue.pop();
|
|
451
|
+
}
|
|
452
|
+
if (pending.size > 0) {
|
|
453
|
+
await Promise.all(pending.values());
|
|
454
|
+
}
|
|
455
|
+
}
|
|
375
456
|
this.running = false;
|
|
376
457
|
const pages = await this.crawlStorage.getResults();
|
|
377
458
|
const errors = await this.crawlStorage.getErrors();
|
|
378
459
|
const sitemapAnalysis = this.buildSitemapAnalysis(pages);
|
|
379
460
|
const robotsAnalysis = this.buildRobotsAnalysis();
|
|
380
461
|
const visited = this.crawlQueue instanceof InMemoryCrawlQueue
|
|
381
|
-
? this.crawlQueue.getVisited()
|
|
462
|
+
? new Set(this.crawlQueue.getVisited())
|
|
382
463
|
: new Set(pages.map(r => r.url));
|
|
464
|
+
await this.crawlQueue.close?.();
|
|
465
|
+
await this.crawlStorage.close?.();
|
|
383
466
|
return {
|
|
384
467
|
startUrl: normalizedStart,
|
|
385
468
|
pages,
|
|
@@ -571,6 +654,8 @@ export class Spider {
|
|
|
571
654
|
let lastRetryAfterMs = 0;
|
|
572
655
|
const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
|
|
573
656
|
const executeRequest = async (useCurl) => {
|
|
657
|
+
if (this.aborted)
|
|
658
|
+
throw new Error('Crawl aborted');
|
|
574
659
|
if (useCurl && this.curlTransport) {
|
|
575
660
|
const curlForRequest = proxyUrl
|
|
576
661
|
? new CurlTransport(proxyUrl)
|
|
@@ -611,6 +696,10 @@ export class Spider {
|
|
|
611
696
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
697
|
const response = await clientForRequest.get(url, {
|
|
613
698
|
headers: this.buildRequestHeaders(url, false),
|
|
699
|
+
signal: this.abortController.signal,
|
|
700
|
+
beforeRedirect: this.options.onRedirect
|
|
701
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
702
|
+
: undefined,
|
|
614
703
|
});
|
|
615
704
|
const contentType = response.headers.get('content-type') || '';
|
|
616
705
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -643,6 +732,7 @@ export class Spider {
|
|
|
643
732
|
};
|
|
644
733
|
};
|
|
645
734
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
735
|
+
await this.waitForDomainRateLimit(hostname);
|
|
646
736
|
await this.waitForDomainPenalty(hostname);
|
|
647
737
|
const useCurl = this.shouldUseCurlForHost(hostname, hasCurl, forcedTransport);
|
|
648
738
|
const transportForAttempt = useCurl ? 'curl' : 'undici';
|
|
@@ -732,7 +822,21 @@ export class Spider {
|
|
|
732
822
|
forcedTransport = 'curl';
|
|
733
823
|
}
|
|
734
824
|
}
|
|
735
|
-
|
|
825
|
+
if (this.options.onRetry) {
|
|
826
|
+
await this.options.onRetry({
|
|
827
|
+
url,
|
|
828
|
+
attempt: attempt + 1,
|
|
829
|
+
maxAttempts,
|
|
830
|
+
reason: attemptReason,
|
|
831
|
+
delay: waitMs,
|
|
832
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
833
|
+
previousStatus: response.status,
|
|
834
|
+
timings,
|
|
835
|
+
});
|
|
836
|
+
}
|
|
837
|
+
if (this.aborted)
|
|
838
|
+
break;
|
|
839
|
+
await sleep(waitMs, this.abortController.signal);
|
|
736
840
|
continue;
|
|
737
841
|
}
|
|
738
842
|
catch (error) {
|
|
@@ -763,8 +867,10 @@ export class Spider {
|
|
|
763
867
|
forcedTransport = 'curl';
|
|
764
868
|
}
|
|
765
869
|
}
|
|
870
|
+
if (this.aborted)
|
|
871
|
+
break;
|
|
766
872
|
const waitMs = this.getRetryWait(hostname, attempt + 1);
|
|
767
|
-
await sleep(waitMs);
|
|
873
|
+
await sleep(waitMs, this.abortController.signal);
|
|
768
874
|
}
|
|
769
875
|
}
|
|
770
876
|
if (lastResponse) {
|
|
@@ -867,7 +973,21 @@ export class Spider {
|
|
|
867
973
|
};
|
|
868
974
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
975
|
this._resultCount++;
|
|
870
|
-
this.options.onPage
|
|
976
|
+
if (this.options.onPage) {
|
|
977
|
+
let cachedDoc = null;
|
|
978
|
+
await this.options.onPage({
|
|
979
|
+
result: nonHtmlResult,
|
|
980
|
+
html: html || undefined,
|
|
981
|
+
document: html ? () => {
|
|
982
|
+
if (cachedDoc)
|
|
983
|
+
return Promise.resolve(cachedDoc);
|
|
984
|
+
return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
|
|
985
|
+
} : undefined,
|
|
986
|
+
});
|
|
987
|
+
}
|
|
988
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
989
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
990
|
+
}
|
|
871
991
|
return;
|
|
872
992
|
}
|
|
873
993
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -898,6 +1018,21 @@ export class Spider {
|
|
|
898
1018
|
catch {
|
|
899
1019
|
}
|
|
900
1020
|
}
|
|
1021
|
+
let isDuplicate = false;
|
|
1022
|
+
let duplicateOf;
|
|
1023
|
+
let contentHash;
|
|
1024
|
+
if (this.options.deduplicateContent) {
|
|
1025
|
+
const bodyText = doc.text('body');
|
|
1026
|
+
contentHash = createHash('md5').update(bodyText).digest('hex');
|
|
1027
|
+
const existingUrl = this.contentHashes.get(contentHash);
|
|
1028
|
+
if (existingUrl) {
|
|
1029
|
+
isDuplicate = true;
|
|
1030
|
+
duplicateOf = existingUrl;
|
|
1031
|
+
}
|
|
1032
|
+
else {
|
|
1033
|
+
this.contentHashes.set(contentHash, item.url);
|
|
1034
|
+
}
|
|
1035
|
+
}
|
|
901
1036
|
const result = {
|
|
902
1037
|
url: item.url,
|
|
903
1038
|
status,
|
|
@@ -926,42 +1061,52 @@ export class Spider {
|
|
|
926
1061
|
timings,
|
|
927
1062
|
fetchedAt,
|
|
928
1063
|
extracted,
|
|
1064
|
+
contentHash,
|
|
1065
|
+
isDuplicate: isDuplicate || undefined,
|
|
1066
|
+
duplicateOf,
|
|
929
1067
|
};
|
|
930
1068
|
await this.crawlStorage.saveResult(result);
|
|
931
1069
|
this._resultCount++;
|
|
932
|
-
this.options.onPage
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
for (const link of links) {
|
|
939
|
-
if (!link.href)
|
|
940
|
-
continue;
|
|
941
|
-
const normalized = normalizeUrl(link.href);
|
|
942
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
943
|
-
continue;
|
|
944
|
-
candidateUrls.push(normalized);
|
|
945
|
-
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1070
|
+
if (this.options.onPage) {
|
|
1071
|
+
await this.options.onPage({
|
|
1072
|
+
result,
|
|
1073
|
+
html,
|
|
1074
|
+
document: () => Promise.resolve(doc),
|
|
1075
|
+
});
|
|
946
1076
|
}
|
|
947
|
-
if (
|
|
948
|
-
const
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
1077
|
+
if (!isDuplicate) {
|
|
1078
|
+
const candidates = [];
|
|
1079
|
+
const candidateUrls = [];
|
|
1080
|
+
for (const link of links) {
|
|
1081
|
+
if (!link.href)
|
|
1082
|
+
continue;
|
|
1083
|
+
const normalized = normalizeUrl(link.href);
|
|
1084
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
1085
|
+
continue;
|
|
1086
|
+
candidateUrls.push(normalized);
|
|
1087
|
+
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1088
|
+
}
|
|
1089
|
+
if (candidates.length > 0) {
|
|
1090
|
+
const visitedSet = this.crawlQueue.hasVisitedBatch
|
|
1091
|
+
? await this.crawlQueue.hasVisitedBatch(candidateUrls)
|
|
1092
|
+
: new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
|
|
1093
|
+
const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
|
|
1094
|
+
if (newItems.length > 0) {
|
|
1095
|
+
if (this.crawlQueue.pushBatch) {
|
|
1096
|
+
await this.crawlQueue.pushBatch(newItems);
|
|
1097
|
+
}
|
|
1098
|
+
else {
|
|
1099
|
+
for (const newItem of newItems)
|
|
1100
|
+
await this.crawlQueue.push(newItem);
|
|
1101
|
+
}
|
|
1102
|
+
this._queueSize += newItems.length;
|
|
959
1103
|
}
|
|
960
|
-
this._queueSize += newItems.length;
|
|
961
1104
|
}
|
|
962
1105
|
}
|
|
963
1106
|
}
|
|
964
1107
|
catch (error) {
|
|
1108
|
+
if (this.aborted)
|
|
1109
|
+
return;
|
|
965
1110
|
const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
|
|
966
1111
|
const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
|
|
967
1112
|
const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
|
|
@@ -1026,7 +1171,12 @@ export class Spider {
|
|
|
1026
1171
|
await this.crawlStorage.saveResult(errorResult);
|
|
1027
1172
|
this._resultCount++;
|
|
1028
1173
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
|
-
this.options.onPage
|
|
1174
|
+
if (this.options.onPage) {
|
|
1175
|
+
await this.options.onPage({ result: errorResult });
|
|
1176
|
+
}
|
|
1177
|
+
if (this.options.onError) {
|
|
1178
|
+
await this.options.onError(errorResult);
|
|
1179
|
+
}
|
|
1030
1180
|
}
|
|
1031
1181
|
}
|
|
1032
1182
|
getOrCreateDomainState(hostname) {
|
|
@@ -1146,7 +1296,7 @@ export class Spider {
|
|
|
1146
1296
|
const now = Date.now();
|
|
1147
1297
|
const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
|
|
1148
1298
|
if (delay > 0) {
|
|
1149
|
-
await sleep(delay);
|
|
1299
|
+
await sleep(delay, this.abortController.signal);
|
|
1150
1300
|
}
|
|
1151
1301
|
}
|
|
1152
1302
|
registerDomainBlock(hostname) {
|
|
@@ -1217,6 +1367,7 @@ export class Spider {
|
|
|
1217
1367
|
}
|
|
1218
1368
|
abort() {
|
|
1219
1369
|
this.aborted = true;
|
|
1370
|
+
this.abortController.abort();
|
|
1220
1371
|
}
|
|
1221
1372
|
isRunning() {
|
|
1222
1373
|
return this.running;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
2
|
+
export declare class SqliteCrawlQueue implements CrawlQueueAdapter {
|
|
3
|
+
private db;
|
|
4
|
+
private stmts;
|
|
5
|
+
private constructor();
|
|
6
|
+
static create(opts?: {
|
|
7
|
+
dbPath?: string;
|
|
8
|
+
}): Promise<SqliteCrawlQueue>;
|
|
9
|
+
private ensureDb;
|
|
10
|
+
getDb(): any;
|
|
11
|
+
push(item: CrawlQueueItem): Promise<void>;
|
|
12
|
+
pushBatch(items: CrawlQueueItem[]): Promise<void>;
|
|
13
|
+
pop(): Promise<CrawlQueueItem | null>;
|
|
14
|
+
hasVisited(url: string): Promise<boolean>;
|
|
15
|
+
hasVisitedBatch(urls: string[]): Promise<Set<string>>;
|
|
16
|
+
markVisited(url: string): Promise<void>;
|
|
17
|
+
size(): Promise<number>;
|
|
18
|
+
clear(): Promise<void>;
|
|
19
|
+
close(): Promise<void>;
|
|
20
|
+
getVisitedSet(): Set<string>;
|
|
21
|
+
saveMetadata(key: string, value: string): void;
|
|
22
|
+
getMetadata(key: string): string | undefined;
|
|
23
|
+
getAllMetadata(): Record<string, string>;
|
|
24
|
+
}
|