recker 1.0.93 → 1.0.94-next.83dffd9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
3
3
  export { ScrapeDocument } from './document.js';
4
4
  export { ScrapeElement } from './element.js';
5
5
  export { Spider, spider } from './spider.js';
6
- export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
6
+ export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
7
7
  export { InMemoryCrawlQueue } from './crawl-queue.js';
8
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
8
9
  export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
9
10
  export { InMemoryCrawlStorage } from './crawl-storage.js';
11
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
10
12
  export type { CrawlStorageAdapter } from './crawl-storage.js';
11
13
  export { ListProxyAdapter } from './proxy-adapter.js';
12
14
  export type { ProxyAdapter } from './proxy-adapter.js';
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
3
3
  export { ScrapeElement } from './element.js';
4
4
  export { Spider, spider } from './spider.js';
5
5
  export { InMemoryCrawlQueue } from './crawl-queue.js';
6
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
6
7
  export { InMemoryCrawlStorage } from './crawl-storage.js';
8
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
7
9
  export { ListProxyAdapter } from './proxy-adapter.js';
8
10
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
@@ -1,3 +1,4 @@
1
+ import { ScrapeDocument } from './document.js';
1
2
  import type { ExtractedLink, ExtractionSchema } from './types.js';
2
3
  import type { Options as ParserOptions } from './parser/index.js';
3
4
  import { type SitemapUrl } from '../seo/validators/sitemap.js';
@@ -31,8 +32,7 @@ export interface SpiderOptions {
31
32
  proxy?: string | string[] | ProxyAdapter;
32
33
  transport?: SpiderTransport;
33
34
  preferCurlFirst?: boolean;
34
- onPage?: (result: SpiderPageResult) => void;
35
- onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
35
+ onPage?: (event: SpiderPageEvent) => void | Promise<void>;
36
36
  onCaptchaDetected?: (result: {
37
37
  url: string;
38
38
  status: number;
@@ -40,9 +40,31 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
63
+ domainRateLimit?: {
64
+ maxPerSecond?: number;
65
+ };
66
+ deduplicateContent?: boolean;
67
+ resume?: boolean;
46
68
  crawlQueue?: CrawlQueueAdapter;
47
69
  crawlStorage?: CrawlStorageAdapter;
48
70
  }
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
102
124
  stylesheets: number;
103
125
  };
104
126
  extracted?: Record<string, unknown>;
127
+ contentHash?: string;
128
+ isDuplicate?: boolean;
129
+ duplicateOf?: string;
130
+ }
131
/**
 * Payload passed to the `onPage` callback declared on `SpiderOptions`.
 * Only `result` is always present; the other members depend on whether a
 * body was available for the page.
 */
+ export interface SpiderPageEvent {
132
// The stored result for the page (success, non-HTML, or error result).
+ result: SpiderPageResult;
133
// Raw response body when one is available; the spider's error path emits the event without it.
+ html?: string;
134
// Lazy accessor resolving to a parsed ScrapeDocument for `html`; absent when there is no body to parse.
+ document?: () => Promise<ScrapeDocument>;
105
135
  }
106
136
  export interface SpiderProgress {
107
137
  crawled: number;
@@ -162,7 +192,10 @@ export declare class Spider {
162
192
  private baseHost;
163
193
  private running;
164
194
  private aborted;
195
+ private abortController;
165
196
  private pendingCount;
197
+ private domainRequestTimestamps;
198
+ private contentHashes;
166
199
  private blockedDomains;
167
200
  private curlTransport;
168
201
  private curlAvailable;
@@ -172,6 +205,7 @@ export declare class Spider {
172
205
  private robotsData;
173
206
  private sitemapValidation;
174
207
  private robotsValidation;
208
+ private waitForDomainRateLimit;
175
209
  private toHeaderRecord;
176
210
  constructor(options?: SpiderOptions);
177
211
  crawl(startUrl: string): Promise<SpiderResult>;
@@ -1,3 +1,4 @@
1
+ import { createHash } from 'node:crypto';
1
2
  import { performance } from 'node:perf_hooks';
2
3
  import { createClient } from '../core/client.js';
3
4
  import { ScrapeDocument } from './document.js';
@@ -39,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
39
40
  function getHostname(url) {
40
41
  return new URL(url).hostname;
41
42
  }
42
- function sleep(ms) {
43
+ function sleep(ms, signal) {
43
44
  if (ms <= 0)
44
45
  return Promise.resolve();
45
- return new Promise(resolve => setTimeout(resolve, ms));
46
+ if (signal?.aborted)
47
+ return Promise.resolve();
48
+ return new Promise(resolve => {
49
+ const timer = setTimeout(resolve, ms);
50
+ signal?.addEventListener('abort', () => { clearTimeout(timer); resolve(); }, { once: true });
51
+ });
46
52
  }
47
53
  function getRetryAfterDelay(response) {
48
54
  const retryAfter = response.headers.get('retry-after');
@@ -108,7 +114,8 @@ function shouldCrawl(url, baseHost, options) {
108
114
  if (!['http:', 'https:'].includes(parsed.protocol)) {
109
115
  return false;
110
116
  }
111
- if (options.sameDomain !== false && parsed.hostname !== baseHost) {
117
+ const hostname = parsed.hostname.replace(/^www\./, '');
118
+ if (options.sameDomain !== false && hostname !== baseHost) {
112
119
  return false;
113
120
  }
114
121
  const skipExtensions = [
@@ -188,7 +195,10 @@ export class Spider {
188
195
  baseHost = '';
189
196
  running = false;
190
197
  aborted = false;
198
+ abortController = new AbortController();
191
199
  pendingCount = 0;
200
+ domainRequestTimestamps = new Map();
201
+ contentHashes = new Map();
192
202
  blockedDomains = new Set();
193
203
  curlTransport = null;
194
204
  curlAvailable = false;
@@ -198,6 +208,33 @@ export class Spider {
198
208
  robotsData = null;
199
209
  sitemapValidation = null;
200
210
  robotsValidation = null;
211
+ async waitForDomainRateLimit(hostname) {
212
+ const limit = this.options.domainRateLimit?.maxPerSecond;
213
+ if (!limit || limit <= 0)
214
+ return;
215
+ const now = Date.now();
216
+ const window = 1000;
217
+ let timestamps = this.domainRequestTimestamps.get(hostname);
218
+ if (!timestamps) {
219
+ timestamps = [];
220
+ this.domainRequestTimestamps.set(hostname, timestamps);
221
+ }
222
+ while (timestamps.length > 0 && timestamps[0] <= now - window) {
223
+ timestamps.shift();
224
+ }
225
+ if (timestamps.length >= limit) {
226
+ const waitMs = timestamps[0] + window - now;
227
+ if (waitMs > 0)
228
+ await sleep(waitMs, this.abortController.signal);
229
+ if (this.aborted)
230
+ return;
231
+ const afterWait = Date.now();
232
+ while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
233
+ timestamps.shift();
234
+ }
235
+ }
236
+ timestamps.push(Date.now());
237
+ }
201
238
  toHeaderRecord(headers) {
202
239
  const headerRecord = {};
203
240
  headers.forEach((value, key) => {
@@ -239,11 +276,17 @@ export class Spider {
239
276
  exclude: options.exclude,
240
277
  include: options.include,
241
278
  onPage: options.onPage,
242
- onPageWithHtml: options.onPageWithHtml,
243
279
  onCaptchaDetected: options.onCaptchaDetected,
280
+ onBlocked: options.onBlocked,
281
+ onError: options.onError,
282
+ onRetry: options.onRetry,
283
+ onRedirect: options.onRedirect,
244
284
  onProgress: options.onProgress,
245
285
  extract: extractSchema,
246
286
  parserOptions: options.parserOptions,
287
+ domainRateLimit: options.domainRateLimit,
288
+ deduplicateContent: options.deduplicateContent ?? false,
289
+ resume: options.resume ?? false,
247
290
  };
248
291
  if (options.proxy) {
249
292
  if (typeof options.proxy === 'string') {
@@ -281,22 +324,41 @@ export class Spider {
281
324
  const startTimestamp = Date.now();
282
325
  const normalizedStart = normalizeUrl(startUrl);
283
326
  const baseUrl = new URL(normalizedStart).origin;
284
- this.baseHost = new URL(normalizedStart).hostname;
285
- await this.crawlQueue.clear();
286
- await this.crawlStorage.clear();
287
- this._visitedCount = 0;
288
- this._queueSize = 0;
289
- this._resultCount = 0;
327
+ this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
328
+ if (!this.options.resume) {
329
+ await this.crawlQueue.clear();
330
+ await this.crawlStorage.clear();
331
+ this._visitedCount = 0;
332
+ this._queueSize = 0;
333
+ this._resultCount = 0;
334
+ this.domainRequestTimestamps.clear();
335
+ this.contentHashes.clear();
336
+ }
337
+ else {
338
+ this._queueSize = await this.crawlQueue.size();
339
+ this._resultCount = await this.crawlStorage.getResultCount();
340
+ }
290
341
  this.running = true;
291
342
  this.aborted = false;
343
+ this.abortController = new AbortController();
292
344
  this.pendingCount = 0;
293
345
  this.sitemapUrls = [];
294
346
  this.sitemapUrlSet.clear();
295
347
  this.robotsData = null;
296
348
  this.sitemapValidation = null;
297
349
  this.robotsValidation = null;
298
- this.blockedDomains.clear();
299
- this.domainStates.clear();
350
+ if (!this.options.resume) {
351
+ this.blockedDomains.clear();
352
+ this.domainStates.clear();
353
+ }
354
+ if (this.options.resume && this.options.deduplicateContent) {
355
+ const existingResults = await this.crawlStorage.getResults();
356
+ for (const r of existingResults) {
357
+ if (r.contentHash) {
358
+ this.contentHashes.set(r.contentHash, r.url);
359
+ }
360
+ }
361
+ }
300
362
  if (this.options.transport !== 'undici') {
301
363
  this.curlAvailable = await hasImpersonate();
302
364
  if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
@@ -312,10 +374,12 @@ export class Spider {
312
374
  const pending = new Map();
313
375
  const scheduleUrl = async (item) => {
314
376
  const normalized = normalizeUrl(item.url);
315
- if (await this.crawlQueue.hasVisited(normalized))
316
- return;
317
377
  if (pending.has(normalized))
318
378
  return;
379
+ if (await this.crawlQueue.hasVisited(normalized))
380
+ return;
381
+ await this.crawlQueue.markVisited(normalized);
382
+ this._visitedCount++;
319
383
  if (item.depth > this.options.maxDepth)
320
384
  return;
321
385
  if (this._resultCount + pending.size >= this.options.maxPages)
@@ -331,8 +395,6 @@ export class Spider {
331
395
  return;
332
396
  }
333
397
  }
334
- await this.crawlQueue.markVisited(normalized);
335
- this._visitedCount++;
336
398
  this.pendingCount++;
337
399
  const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
338
400
  .finally(() => {
@@ -341,16 +403,18 @@ export class Spider {
341
403
  });
342
404
  pending.set(normalized, promise);
343
405
  };
344
- await scheduleUrl({ url: normalizedStart, depth: 0 });
345
- if (this.options.useSitemap && this.sitemapUrls.length > 0) {
346
- for (const sitemapUrl of this.sitemapUrls) {
347
- try {
348
- const urlHost = new URL(sitemapUrl.loc).hostname;
349
- if (urlHost === this.baseHost) {
350
- await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
406
+ if (!this.options.resume) {
407
+ await scheduleUrl({ url: normalizedStart, depth: 0 });
408
+ if (this.options.useSitemap && this.sitemapUrls.length > 0) {
409
+ for (const sitemapUrl of this.sitemapUrls) {
410
+ try {
411
+ const urlHost = new URL(sitemapUrl.loc).hostname;
412
+ if (urlHost === this.baseHost) {
413
+ await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
414
+ }
415
+ }
416
+ catch {
351
417
  }
352
- }
353
- catch {
354
418
  }
355
419
  }
356
420
  }
@@ -372,14 +436,33 @@ export class Spider {
372
436
  if (pending.size > 0) {
373
437
  await Promise.all(pending.values());
374
438
  }
439
+ while (!this.aborted && this._resultCount < this.options.maxPages) {
440
+ const remaining = await this.crawlQueue.size();
441
+ if (remaining === 0 && pending.size === 0)
442
+ break;
443
+ this._queueSize = remaining;
444
+ let nextItem = await this.crawlQueue.pop();
445
+ while (nextItem && !this.aborted) {
446
+ this._queueSize = Math.max(0, this._queueSize - 1);
447
+ if (this._resultCount + pending.size >= this.options.maxPages)
448
+ break;
449
+ await scheduleUrl(nextItem);
450
+ nextItem = await this.crawlQueue.pop();
451
+ }
452
+ if (pending.size > 0) {
453
+ await Promise.all(pending.values());
454
+ }
455
+ }
375
456
  this.running = false;
376
457
  const pages = await this.crawlStorage.getResults();
377
458
  const errors = await this.crawlStorage.getErrors();
378
459
  const sitemapAnalysis = this.buildSitemapAnalysis(pages);
379
460
  const robotsAnalysis = this.buildRobotsAnalysis();
380
461
  const visited = this.crawlQueue instanceof InMemoryCrawlQueue
381
- ? this.crawlQueue.getVisited()
462
+ ? new Set(this.crawlQueue.getVisited())
382
463
  : new Set(pages.map(r => r.url));
464
+ await this.crawlQueue.close?.();
465
+ await this.crawlStorage.close?.();
383
466
  return {
384
467
  startUrl: normalizedStart,
385
468
  pages,
@@ -571,6 +654,8 @@ export class Spider {
571
654
  let lastRetryAfterMs = 0;
572
655
  const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
573
656
  const executeRequest = async (useCurl) => {
657
+ if (this.aborted)
658
+ throw new Error('Crawl aborted');
574
659
  if (useCurl && this.curlTransport) {
575
660
  const curlForRequest = proxyUrl
576
661
  ? new CurlTransport(proxyUrl)
@@ -611,6 +696,10 @@ export class Spider {
611
696
  const clientForRequest = this.getClientForProxy(proxyUrl);
612
697
  const response = await clientForRequest.get(url, {
613
698
  headers: this.buildRequestHeaders(url, false),
699
+ signal: this.abortController.signal,
700
+ beforeRedirect: this.options.onRedirect
701
+ ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
702
+ : undefined,
614
703
  });
615
704
  const contentType = response.headers.get('content-type') || '';
616
705
  const shouldReadUndiciBody = !contentType ||
@@ -643,6 +732,7 @@ export class Spider {
643
732
  };
644
733
  };
645
734
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
735
+ await this.waitForDomainRateLimit(hostname);
646
736
  await this.waitForDomainPenalty(hostname);
647
737
  const useCurl = this.shouldUseCurlForHost(hostname, hasCurl, forcedTransport);
648
738
  const transportForAttempt = useCurl ? 'curl' : 'undici';
@@ -732,7 +822,21 @@ export class Spider {
732
822
  forcedTransport = 'curl';
733
823
  }
734
824
  }
735
- await sleep(waitMs);
825
+ if (this.options.onRetry) {
826
+ await this.options.onRetry({
827
+ url,
828
+ attempt: attempt + 1,
829
+ maxAttempts,
830
+ reason: attemptReason,
831
+ delay: waitMs,
832
+ transport: forcedTransport ?? transportForAttempt,
833
+ previousStatus: response.status,
834
+ timings,
835
+ });
836
+ }
837
+ if (this.aborted)
838
+ break;
839
+ await sleep(waitMs, this.abortController.signal);
736
840
  continue;
737
841
  }
738
842
  catch (error) {
@@ -763,8 +867,10 @@ export class Spider {
763
867
  forcedTransport = 'curl';
764
868
  }
765
869
  }
870
+ if (this.aborted)
871
+ break;
766
872
  const waitMs = this.getRetryWait(hostname, attempt + 1);
767
- await sleep(waitMs);
873
+ await sleep(waitMs, this.abortController.signal);
768
874
  }
769
875
  }
770
876
  if (lastResponse) {
@@ -867,7 +973,21 @@ export class Spider {
867
973
  };
868
974
  await this.crawlStorage.saveResult(nonHtmlResult);
869
975
  this._resultCount++;
870
- this.options.onPage?.(nonHtmlResult);
976
+ if (this.options.onPage) {
977
+ let cachedDoc = null;
978
+ await this.options.onPage({
979
+ result: nonHtmlResult,
980
+ html: html || undefined,
981
+ document: html ? () => {
982
+ if (cachedDoc)
983
+ return Promise.resolve(cachedDoc);
984
+ return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
985
+ } : undefined,
986
+ });
987
+ }
988
+ if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
989
+ await this.options.onBlocked(nonHtmlResult);
990
+ }
871
991
  return;
872
992
  }
873
993
  const doc = await ScrapeDocument.create(html, {
@@ -898,6 +1018,21 @@ export class Spider {
898
1018
  catch {
899
1019
  }
900
1020
  }
1021
+ let isDuplicate = false;
1022
+ let duplicateOf;
1023
+ let contentHash;
1024
+ if (this.options.deduplicateContent) {
1025
+ const bodyText = doc.text('body');
1026
+ contentHash = createHash('md5').update(bodyText).digest('hex');
1027
+ const existingUrl = this.contentHashes.get(contentHash);
1028
+ if (existingUrl) {
1029
+ isDuplicate = true;
1030
+ duplicateOf = existingUrl;
1031
+ }
1032
+ else {
1033
+ this.contentHashes.set(contentHash, item.url);
1034
+ }
1035
+ }
901
1036
  const result = {
902
1037
  url: item.url,
903
1038
  status,
@@ -926,42 +1061,52 @@ export class Spider {
926
1061
  timings,
927
1062
  fetchedAt,
928
1063
  extracted,
1064
+ contentHash,
1065
+ isDuplicate: isDuplicate || undefined,
1066
+ duplicateOf,
929
1067
  };
930
1068
  await this.crawlStorage.saveResult(result);
931
1069
  this._resultCount++;
932
- this.options.onPage?.(result);
933
- if (this.options.onPageWithHtml) {
934
- await this.options.onPageWithHtml(result, html);
935
- }
936
- const candidates = [];
937
- const candidateUrls = [];
938
- for (const link of links) {
939
- if (!link.href)
940
- continue;
941
- const normalized = normalizeUrl(link.href);
942
- if (!shouldCrawl(normalized, this.baseHost, this.options))
943
- continue;
944
- candidateUrls.push(normalized);
945
- candidates.push({ url: normalized, depth: item.depth + 1 });
1070
+ if (this.options.onPage) {
1071
+ await this.options.onPage({
1072
+ result,
1073
+ html,
1074
+ document: () => Promise.resolve(doc),
1075
+ });
946
1076
  }
947
- if (candidates.length > 0) {
948
- const visitedSet = this.crawlQueue.hasVisitedBatch
949
- ? await this.crawlQueue.hasVisitedBatch(candidateUrls)
950
- : new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
951
- const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
952
- if (newItems.length > 0) {
953
- if (this.crawlQueue.pushBatch) {
954
- await this.crawlQueue.pushBatch(newItems);
955
- }
956
- else {
957
- for (const newItem of newItems)
958
- await this.crawlQueue.push(newItem);
1077
+ if (!isDuplicate) {
1078
+ const candidates = [];
1079
+ const candidateUrls = [];
1080
+ for (const link of links) {
1081
+ if (!link.href)
1082
+ continue;
1083
+ const normalized = normalizeUrl(link.href);
1084
+ if (!shouldCrawl(normalized, this.baseHost, this.options))
1085
+ continue;
1086
+ candidateUrls.push(normalized);
1087
+ candidates.push({ url: normalized, depth: item.depth + 1 });
1088
+ }
1089
+ if (candidates.length > 0) {
1090
+ const visitedSet = this.crawlQueue.hasVisitedBatch
1091
+ ? await this.crawlQueue.hasVisitedBatch(candidateUrls)
1092
+ : new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
1093
+ const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
1094
+ if (newItems.length > 0) {
1095
+ if (this.crawlQueue.pushBatch) {
1096
+ await this.crawlQueue.pushBatch(newItems);
1097
+ }
1098
+ else {
1099
+ for (const newItem of newItems)
1100
+ await this.crawlQueue.push(newItem);
1101
+ }
1102
+ this._queueSize += newItems.length;
959
1103
  }
960
- this._queueSize += newItems.length;
961
1104
  }
962
1105
  }
963
1106
  }
964
1107
  catch (error) {
1108
+ if (this.aborted)
1109
+ return;
965
1110
  const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
966
1111
  const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
967
1112
  const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
@@ -1026,7 +1171,12 @@ export class Spider {
1026
1171
  await this.crawlStorage.saveResult(errorResult);
1027
1172
  this._resultCount++;
1028
1173
  await this.crawlStorage.saveError({ url: item.url, error: message });
1029
- this.options.onPage?.(errorResult);
1174
+ if (this.options.onPage) {
1175
+ await this.options.onPage({ result: errorResult });
1176
+ }
1177
+ if (this.options.onError) {
1178
+ await this.options.onError(errorResult);
1179
+ }
1030
1180
  }
1031
1181
  }
1032
1182
  getOrCreateDomainState(hostname) {
@@ -1146,7 +1296,7 @@ export class Spider {
1146
1296
  const now = Date.now();
1147
1297
  const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1148
1298
  if (delay > 0) {
1149
- await sleep(delay);
1299
+ await sleep(delay, this.abortController.signal);
1150
1300
  }
1151
1301
  }
1152
1302
  registerDomainBlock(hostname) {
@@ -1217,6 +1367,7 @@ export class Spider {
1217
1367
  }
1218
1368
  /**
   * Requests cancellation of the current crawl.
   * Sets the `aborted` flag that the crawl loops check, then fires the
   * AbortController whose signal is passed to abortable sleeps and to requests.
   */
  abort() {
1219
1369
  this.aborted = true;
1370
+ this.abortController.abort();
1220
1371
  }
1221
1372
  isRunning() {
1222
1373
  return this.running;
@@ -0,0 +1,24 @@
1
+ import type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
2
/**
 * Crawl queue implementing `CrawlQueueAdapter`.
 * Presumably backed by a SQLite database (per the class name and `dbPath`
 * option); the implementation is not visible in this declaration.
 * Instances are obtained through the async `create()` factory because the
 * constructor is private.
 */
+ export declare class SqliteCrawlQueue implements CrawlQueueAdapter {
3
+ private db;
4
+ private stmts;
5
+ private constructor();
6
// Async factory; `dbPath` is optional. NOTE(review): default location when omitted is not visible here; confirm.
+ static create(opts?: {
7
+ dbPath?: string;
8
+ }): Promise<SqliteCrawlQueue>;
9
+ private ensureDb;
10
// NOTE(review): returns `any`, so the underlying handle is exposed untyped to callers.
+ getDb(): any;
11
// Queue operations required by CrawlQueueAdapter; all are promise-based.
+ push(item: CrawlQueueItem): Promise<void>;
12
+ pushBatch(items: CrawlQueueItem[]): Promise<void>;
13
// Resolves to null rather than rejecting when no item is returned.
+ pop(): Promise<CrawlQueueItem | null>;
14
+ hasVisited(url: string): Promise<boolean>;
15
// Presumably resolves to the subset of `urls` already visited; verify against the implementation.
+ hasVisitedBatch(urls: string[]): Promise<Set<string>>;
16
+ markVisited(url: string): Promise<void>;
17
+ size(): Promise<number>;
18
+ clear(): Promise<void>;
19
+ close(): Promise<void>;
20
// Synchronous helpers beyond the promise-based queue methods above.
+ getVisitedSet(): Set<string>;
21
+ saveMetadata(key: string, value: string): void;
22
+ getMetadata(key: string): string | undefined;
23
+ getAllMetadata(): Record<string, string>;
24
+ }