recker 1.0.94 → 1.0.95

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,8 @@ export interface CrawlQueueAdapter {
17
17
  }
18
18
  export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
19
19
  private queue;
20
+ private head;
21
+ private tail;
20
22
  private visited;
21
23
  push(item: CrawlQueueItem): Promise<void>;
22
24
  pushBatch(items: CrawlQueueItem[]): Promise<void>;
@@ -1,14 +1,31 @@
1
1
  export class InMemoryCrawlQueue {
2
2
  queue = [];
3
+ head = 0;
4
+ tail = 0;
3
5
  visited = new Set();
4
6
  async push(item) {
5
- this.queue.push(item);
7
+ this.queue[this.tail++] = item;
6
8
  }
7
9
  async pushBatch(items) {
8
- this.queue.push(...items);
10
+ for (const item of items) {
11
+ this.queue[this.tail++] = item;
12
+ }
9
13
  }
10
14
  async pop() {
11
- return this.queue.shift() ?? null;
15
+ while (this.head < this.tail) {
16
+ const item = this.queue[this.head];
17
+ this.queue[this.head] = undefined;
18
+ this.head++;
19
+ if (item) {
20
+ if (this.head > 1024 && this.head > this.tail / 2) {
21
+ this.queue = this.queue.slice(this.head);
22
+ this.tail -= this.head;
23
+ this.head = 0;
24
+ }
25
+ return item;
26
+ }
27
+ }
28
+ return null;
12
29
  }
13
30
  async hasVisited(url) {
14
31
  return this.visited.has(url);
@@ -25,10 +42,12 @@ export class InMemoryCrawlQueue {
25
42
  this.visited.add(url);
26
43
  }
27
44
  async size() {
28
- return this.queue.length;
45
+ return this.tail - this.head;
29
46
  }
30
47
  async clear() {
31
48
  this.queue = [];
49
+ this.head = 0;
50
+ this.tail = 0;
32
51
  this.visited.clear();
33
52
  }
34
53
  async close() {
@@ -192,6 +192,7 @@ export declare class Spider {
192
192
  private baseHost;
193
193
  private running;
194
194
  private aborted;
195
+ private abortController;
195
196
  private pendingCount;
196
197
  private domainRequestTimestamps;
197
198
  private contentHashes;
@@ -40,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
40
40
  function getHostname(url) {
41
41
  return new URL(url).hostname;
42
42
  }
43
- function sleep(ms) {
43
+ function sleep(ms, signal) {
44
44
  if (ms <= 0)
45
45
  return Promise.resolve();
46
- return new Promise(resolve => setTimeout(resolve, ms));
46
+ if (signal?.aborted)
47
+ return Promise.resolve();
48
+ return new Promise(resolve => {
49
+ const timer = setTimeout(resolve, ms);
50
+ signal?.addEventListener('abort', () => { clearTimeout(timer); resolve(); }, { once: true });
51
+ });
47
52
  }
48
53
  function getRetryAfterDelay(response) {
49
54
  const retryAfter = response.headers.get('retry-after');
@@ -109,7 +114,8 @@ function shouldCrawl(url, baseHost, options) {
109
114
  if (!['http:', 'https:'].includes(parsed.protocol)) {
110
115
  return false;
111
116
  }
112
- if (options.sameDomain !== false && parsed.hostname !== baseHost) {
117
+ const hostname = parsed.hostname.replace(/^www\./, '');
118
+ if (options.sameDomain !== false && hostname !== baseHost) {
113
119
  return false;
114
120
  }
115
121
  const skipExtensions = [
@@ -189,6 +195,7 @@ export class Spider {
189
195
  baseHost = '';
190
196
  running = false;
191
197
  aborted = false;
198
+ abortController = new AbortController();
192
199
  pendingCount = 0;
193
200
  domainRequestTimestamps = new Map();
194
201
  contentHashes = new Map();
@@ -218,7 +225,9 @@ export class Spider {
218
225
  if (timestamps.length >= limit) {
219
226
  const waitMs = timestamps[0] + window - now;
220
227
  if (waitMs > 0)
221
- await sleep(waitMs);
228
+ await sleep(waitMs, this.abortController.signal);
229
+ if (this.aborted)
230
+ return;
222
231
  const afterWait = Date.now();
223
232
  while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
224
233
  timestamps.shift();
@@ -315,7 +324,7 @@ export class Spider {
315
324
  const startTimestamp = Date.now();
316
325
  const normalizedStart = normalizeUrl(startUrl);
317
326
  const baseUrl = new URL(normalizedStart).origin;
318
- this.baseHost = new URL(normalizedStart).hostname;
327
+ this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
319
328
  if (!this.options.resume) {
320
329
  await this.crawlQueue.clear();
321
330
  await this.crawlStorage.clear();
@@ -331,14 +340,25 @@ export class Spider {
331
340
  }
332
341
  this.running = true;
333
342
  this.aborted = false;
343
+ this.abortController = new AbortController();
334
344
  this.pendingCount = 0;
335
345
  this.sitemapUrls = [];
336
346
  this.sitemapUrlSet.clear();
337
347
  this.robotsData = null;
338
348
  this.sitemapValidation = null;
339
349
  this.robotsValidation = null;
340
- this.blockedDomains.clear();
341
- this.domainStates.clear();
350
+ if (!this.options.resume) {
351
+ this.blockedDomains.clear();
352
+ this.domainStates.clear();
353
+ }
354
+ if (this.options.resume && this.options.deduplicateContent) {
355
+ const existingResults = await this.crawlStorage.getResults();
356
+ for (const r of existingResults) {
357
+ if (r.contentHash) {
358
+ this.contentHashes.set(r.contentHash, r.url);
359
+ }
360
+ }
361
+ }
342
362
  if (this.options.transport !== 'undici') {
343
363
  this.curlAvailable = await hasImpersonate();
344
364
  if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
@@ -354,10 +374,12 @@ export class Spider {
354
374
  const pending = new Map();
355
375
  const scheduleUrl = async (item) => {
356
376
  const normalized = normalizeUrl(item.url);
357
- if (await this.crawlQueue.hasVisited(normalized))
358
- return;
359
377
  if (pending.has(normalized))
360
378
  return;
379
+ if (await this.crawlQueue.hasVisited(normalized))
380
+ return;
381
+ await this.crawlQueue.markVisited(normalized);
382
+ this._visitedCount++;
361
383
  if (item.depth > this.options.maxDepth)
362
384
  return;
363
385
  if (this._resultCount + pending.size >= this.options.maxPages)
@@ -373,8 +395,6 @@ export class Spider {
373
395
  return;
374
396
  }
375
397
  }
376
- await this.crawlQueue.markVisited(normalized);
377
- this._visitedCount++;
378
398
  this.pendingCount++;
379
399
  const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
380
400
  .finally(() => {
@@ -416,14 +436,33 @@ export class Spider {
416
436
  if (pending.size > 0) {
417
437
  await Promise.all(pending.values());
418
438
  }
439
+ while (!this.aborted && this._resultCount < this.options.maxPages) {
440
+ const remaining = await this.crawlQueue.size();
441
+ if (remaining === 0 && pending.size === 0)
442
+ break;
443
+ this._queueSize = remaining;
444
+ let nextItem = await this.crawlQueue.pop();
445
+ while (nextItem && !this.aborted) {
446
+ this._queueSize = Math.max(0, this._queueSize - 1);
447
+ if (this._resultCount + pending.size >= this.options.maxPages)
448
+ break;
449
+ await scheduleUrl(nextItem);
450
+ nextItem = await this.crawlQueue.pop();
451
+ }
452
+ if (pending.size > 0) {
453
+ await Promise.all(pending.values());
454
+ }
455
+ }
419
456
  this.running = false;
420
457
  const pages = await this.crawlStorage.getResults();
421
458
  const errors = await this.crawlStorage.getErrors();
422
459
  const sitemapAnalysis = this.buildSitemapAnalysis(pages);
423
460
  const robotsAnalysis = this.buildRobotsAnalysis();
424
461
  const visited = this.crawlQueue instanceof InMemoryCrawlQueue
425
- ? this.crawlQueue.getVisited()
462
+ ? new Set(this.crawlQueue.getVisited())
426
463
  : new Set(pages.map(r => r.url));
464
+ await this.crawlQueue.close?.();
465
+ await this.crawlStorage.close?.();
427
466
  return {
428
467
  startUrl: normalizedStart,
429
468
  pages,
@@ -615,6 +654,8 @@ export class Spider {
615
654
  let lastRetryAfterMs = 0;
616
655
  const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
617
656
  const executeRequest = async (useCurl) => {
657
+ if (this.aborted)
658
+ throw new Error('Crawl aborted');
618
659
  if (useCurl && this.curlTransport) {
619
660
  const curlForRequest = proxyUrl
620
661
  ? new CurlTransport(proxyUrl)
@@ -655,6 +696,7 @@ export class Spider {
655
696
  const clientForRequest = this.getClientForProxy(proxyUrl);
656
697
  const response = await clientForRequest.get(url, {
657
698
  headers: this.buildRequestHeaders(url, false),
699
+ signal: this.abortController.signal,
658
700
  beforeRedirect: this.options.onRedirect
659
701
  ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
660
702
  : undefined,
@@ -792,7 +834,9 @@ export class Spider {
792
834
  timings,
793
835
  });
794
836
  }
795
- await sleep(waitMs);
837
+ if (this.aborted)
838
+ break;
839
+ await sleep(waitMs, this.abortController.signal);
796
840
  continue;
797
841
  }
798
842
  catch (error) {
@@ -823,8 +867,10 @@ export class Spider {
823
867
  forcedTransport = 'curl';
824
868
  }
825
869
  }
870
+ if (this.aborted)
871
+ break;
826
872
  const waitMs = this.getRetryWait(hostname, attempt + 1);
827
- await sleep(waitMs);
873
+ await sleep(waitMs, this.abortController.signal);
828
874
  }
829
875
  }
830
876
  if (lastResponse) {
@@ -1059,6 +1105,8 @@ export class Spider {
1059
1105
  }
1060
1106
  }
1061
1107
  catch (error) {
1108
+ if (this.aborted)
1109
+ return;
1062
1110
  const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
1063
1111
  const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
1064
1112
  const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
@@ -1248,7 +1296,7 @@ export class Spider {
1248
1296
  const now = Date.now();
1249
1297
  const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1250
1298
  if (delay > 0) {
1251
- await sleep(delay);
1299
+ await sleep(delay, this.abortController.signal);
1252
1300
  }
1253
1301
  }
1254
1302
  registerDomainBlock(hostname) {
@@ -1319,6 +1367,7 @@ export class Spider {
1319
1367
  }
1320
1368
  abort() {
1321
1369
  this.aborted = true;
1370
+ this.abortController.abort();
1322
1371
  }
1323
1372
  isRunning() {
1324
1373
  return this.running;
@@ -56,7 +56,7 @@ export class SeoSpider {
56
56
  });
57
57
  }
58
58
  async analyzePageDuringCrawl(pageResult, html) {
59
- if (pageResult.error || pageResult.status >= 400) {
59
+ if (pageResult.status >= 400 || !html) {
60
60
  const seoPage = { ...pageResult, seoReport: undefined };
61
61
  this.seoPages.push(seoPage);
62
62
  return;
@@ -517,6 +517,12 @@ export function detectBlock(response, body) {
517
517
  description: 'DataDome headers detected',
518
518
  });
519
519
  }
520
+ if (response.status === 200 &&
521
+ results.length === 0 &&
522
+ !location &&
523
+ !body) {
524
+ return { blocked: false, confidence: 0 };
525
+ }
520
526
  if (body) {
521
527
  const isLongBody = body.length > 100_000;
522
528
  const checkBody = isLongBody ? body.slice(0, 30_000) : body;
@@ -588,6 +594,12 @@ export function detectCaptcha(response, body) {
588
594
  addMatch(matches, 'cloudflare', 0.7, 'Cloudflare challenge headers detected');
589
595
  }
590
596
  }
597
+ if (matches.length === 0 &&
598
+ response.status >= 200 && response.status < 300 &&
599
+ !location &&
600
+ !body) {
601
+ return { detected: false, confidence: 0 };
602
+ }
591
603
  if (body) {
592
604
  const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
593
605
  const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
@@ -17,6 +17,8 @@ export interface CrawlQueueAdapter {
17
17
  }
18
18
  export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
19
19
  private queue;
20
+ private head;
21
+ private tail;
20
22
  private visited;
21
23
  push(item: CrawlQueueItem): Promise<void>;
22
24
  pushBatch(items: CrawlQueueItem[]): Promise<void>;
@@ -1,14 +1,31 @@
1
1
  export class InMemoryCrawlQueue {
2
2
  queue = [];
3
+ head = 0;
4
+ tail = 0;
3
5
  visited = new Set();
4
6
  async push(item) {
5
- this.queue.push(item);
7
+ this.queue[this.tail++] = item;
6
8
  }
7
9
  async pushBatch(items) {
8
- this.queue.push(...items);
10
+ for (const item of items) {
11
+ this.queue[this.tail++] = item;
12
+ }
9
13
  }
10
14
  async pop() {
11
- return this.queue.shift() ?? null;
15
+ while (this.head < this.tail) {
16
+ const item = this.queue[this.head];
17
+ this.queue[this.head] = undefined;
18
+ this.head++;
19
+ if (item) {
20
+ if (this.head > 1024 && this.head > this.tail / 2) {
21
+ this.queue = this.queue.slice(this.head);
22
+ this.tail -= this.head;
23
+ this.head = 0;
24
+ }
25
+ return item;
26
+ }
27
+ }
28
+ return null;
12
29
  }
13
30
  async hasVisited(url) {
14
31
  return this.visited.has(url);
@@ -25,10 +42,12 @@ export class InMemoryCrawlQueue {
25
42
  this.visited.add(url);
26
43
  }
27
44
  async size() {
28
- return this.queue.length;
45
+ return this.tail - this.head;
29
46
  }
30
47
  async clear() {
31
48
  this.queue = [];
49
+ this.head = 0;
50
+ this.tail = 0;
32
51
  this.visited.clear();
33
52
  }
34
53
  async close() {
@@ -192,6 +192,7 @@ export declare class Spider {
192
192
  private baseHost;
193
193
  private running;
194
194
  private aborted;
195
+ private abortController;
195
196
  private pendingCount;
196
197
  private domainRequestTimestamps;
197
198
  private contentHashes;
@@ -40,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
40
40
  function getHostname(url) {
41
41
  return new URL(url).hostname;
42
42
  }
43
- function sleep(ms) {
43
+ function sleep(ms, signal) {
44
44
  if (ms <= 0)
45
45
  return Promise.resolve();
46
- return new Promise(resolve => setTimeout(resolve, ms));
46
+ if (signal?.aborted)
47
+ return Promise.resolve();
48
+ return new Promise(resolve => {
49
+ const timer = setTimeout(resolve, ms);
50
+ signal?.addEventListener('abort', () => { clearTimeout(timer); resolve(); }, { once: true });
51
+ });
47
52
  }
48
53
  function getRetryAfterDelay(response) {
49
54
  const retryAfter = response.headers.get('retry-after');
@@ -109,7 +114,8 @@ function shouldCrawl(url, baseHost, options) {
109
114
  if (!['http:', 'https:'].includes(parsed.protocol)) {
110
115
  return false;
111
116
  }
112
- if (options.sameDomain !== false && parsed.hostname !== baseHost) {
117
+ const hostname = parsed.hostname.replace(/^www\./, '');
118
+ if (options.sameDomain !== false && hostname !== baseHost) {
113
119
  return false;
114
120
  }
115
121
  const skipExtensions = [
@@ -189,6 +195,7 @@ export class Spider {
189
195
  baseHost = '';
190
196
  running = false;
191
197
  aborted = false;
198
+ abortController = new AbortController();
192
199
  pendingCount = 0;
193
200
  domainRequestTimestamps = new Map();
194
201
  contentHashes = new Map();
@@ -218,7 +225,9 @@ export class Spider {
218
225
  if (timestamps.length >= limit) {
219
226
  const waitMs = timestamps[0] + window - now;
220
227
  if (waitMs > 0)
221
- await sleep(waitMs);
228
+ await sleep(waitMs, this.abortController.signal);
229
+ if (this.aborted)
230
+ return;
222
231
  const afterWait = Date.now();
223
232
  while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
224
233
  timestamps.shift();
@@ -315,7 +324,7 @@ export class Spider {
315
324
  const startTimestamp = Date.now();
316
325
  const normalizedStart = normalizeUrl(startUrl);
317
326
  const baseUrl = new URL(normalizedStart).origin;
318
- this.baseHost = new URL(normalizedStart).hostname;
327
+ this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
319
328
  if (!this.options.resume) {
320
329
  await this.crawlQueue.clear();
321
330
  await this.crawlStorage.clear();
@@ -331,14 +340,25 @@ export class Spider {
331
340
  }
332
341
  this.running = true;
333
342
  this.aborted = false;
343
+ this.abortController = new AbortController();
334
344
  this.pendingCount = 0;
335
345
  this.sitemapUrls = [];
336
346
  this.sitemapUrlSet.clear();
337
347
  this.robotsData = null;
338
348
  this.sitemapValidation = null;
339
349
  this.robotsValidation = null;
340
- this.blockedDomains.clear();
341
- this.domainStates.clear();
350
+ if (!this.options.resume) {
351
+ this.blockedDomains.clear();
352
+ this.domainStates.clear();
353
+ }
354
+ if (this.options.resume && this.options.deduplicateContent) {
355
+ const existingResults = await this.crawlStorage.getResults();
356
+ for (const r of existingResults) {
357
+ if (r.contentHash) {
358
+ this.contentHashes.set(r.contentHash, r.url);
359
+ }
360
+ }
361
+ }
342
362
  if (this.options.transport !== 'undici') {
343
363
  this.curlAvailable = await hasImpersonate();
344
364
  if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
@@ -354,10 +374,12 @@ export class Spider {
354
374
  const pending = new Map();
355
375
  const scheduleUrl = async (item) => {
356
376
  const normalized = normalizeUrl(item.url);
357
- if (await this.crawlQueue.hasVisited(normalized))
358
- return;
359
377
  if (pending.has(normalized))
360
378
  return;
379
+ if (await this.crawlQueue.hasVisited(normalized))
380
+ return;
381
+ await this.crawlQueue.markVisited(normalized);
382
+ this._visitedCount++;
361
383
  if (item.depth > this.options.maxDepth)
362
384
  return;
363
385
  if (this._resultCount + pending.size >= this.options.maxPages)
@@ -373,8 +395,6 @@ export class Spider {
373
395
  return;
374
396
  }
375
397
  }
376
- await this.crawlQueue.markVisited(normalized);
377
- this._visitedCount++;
378
398
  this.pendingCount++;
379
399
  const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
380
400
  .finally(() => {
@@ -416,14 +436,33 @@ export class Spider {
416
436
  if (pending.size > 0) {
417
437
  await Promise.all(pending.values());
418
438
  }
439
+ while (!this.aborted && this._resultCount < this.options.maxPages) {
440
+ const remaining = await this.crawlQueue.size();
441
+ if (remaining === 0 && pending.size === 0)
442
+ break;
443
+ this._queueSize = remaining;
444
+ let nextItem = await this.crawlQueue.pop();
445
+ while (nextItem && !this.aborted) {
446
+ this._queueSize = Math.max(0, this._queueSize - 1);
447
+ if (this._resultCount + pending.size >= this.options.maxPages)
448
+ break;
449
+ await scheduleUrl(nextItem);
450
+ nextItem = await this.crawlQueue.pop();
451
+ }
452
+ if (pending.size > 0) {
453
+ await Promise.all(pending.values());
454
+ }
455
+ }
419
456
  this.running = false;
420
457
  const pages = await this.crawlStorage.getResults();
421
458
  const errors = await this.crawlStorage.getErrors();
422
459
  const sitemapAnalysis = this.buildSitemapAnalysis(pages);
423
460
  const robotsAnalysis = this.buildRobotsAnalysis();
424
461
  const visited = this.crawlQueue instanceof InMemoryCrawlQueue
425
- ? this.crawlQueue.getVisited()
462
+ ? new Set(this.crawlQueue.getVisited())
426
463
  : new Set(pages.map(r => r.url));
464
+ await this.crawlQueue.close?.();
465
+ await this.crawlStorage.close?.();
427
466
  return {
428
467
  startUrl: normalizedStart,
429
468
  pages,
@@ -615,6 +654,8 @@ export class Spider {
615
654
  let lastRetryAfterMs = 0;
616
655
  const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
617
656
  const executeRequest = async (useCurl) => {
657
+ if (this.aborted)
658
+ throw new Error('Crawl aborted');
618
659
  if (useCurl && this.curlTransport) {
619
660
  const curlForRequest = proxyUrl
620
661
  ? new CurlTransport(proxyUrl)
@@ -655,6 +696,7 @@ export class Spider {
655
696
  const clientForRequest = this.getClientForProxy(proxyUrl);
656
697
  const response = await clientForRequest.get(url, {
657
698
  headers: this.buildRequestHeaders(url, false),
699
+ signal: this.abortController.signal,
658
700
  beforeRedirect: this.options.onRedirect
659
701
  ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
660
702
  : undefined,
@@ -792,7 +834,9 @@ export class Spider {
792
834
  timings,
793
835
  });
794
836
  }
795
- await sleep(waitMs);
837
+ if (this.aborted)
838
+ break;
839
+ await sleep(waitMs, this.abortController.signal);
796
840
  continue;
797
841
  }
798
842
  catch (error) {
@@ -823,8 +867,10 @@ export class Spider {
823
867
  forcedTransport = 'curl';
824
868
  }
825
869
  }
870
+ if (this.aborted)
871
+ break;
826
872
  const waitMs = this.getRetryWait(hostname, attempt + 1);
827
- await sleep(waitMs);
873
+ await sleep(waitMs, this.abortController.signal);
828
874
  }
829
875
  }
830
876
  if (lastResponse) {
@@ -1059,6 +1105,8 @@ export class Spider {
1059
1105
  }
1060
1106
  }
1061
1107
  catch (error) {
1108
+ if (this.aborted)
1109
+ return;
1062
1110
  const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
1063
1111
  const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
1064
1112
  const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
@@ -1248,7 +1296,7 @@ export class Spider {
1248
1296
  const now = Date.now();
1249
1297
  const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1250
1298
  if (delay > 0) {
1251
- await sleep(delay);
1299
+ await sleep(delay, this.abortController.signal);
1252
1300
  }
1253
1301
  }
1254
1302
  registerDomainBlock(hostname) {
@@ -1319,6 +1367,7 @@ export class Spider {
1319
1367
  }
1320
1368
  abort() {
1321
1369
  this.aborted = true;
1370
+ this.abortController.abort();
1322
1371
  }
1323
1372
  isRunning() {
1324
1373
  return this.running;
@@ -56,7 +56,7 @@ export class SeoSpider {
56
56
  });
57
57
  }
58
58
  async analyzePageDuringCrawl(pageResult, html) {
59
- if (pageResult.error || pageResult.status >= 400) {
59
+ if (pageResult.status >= 400 || !html) {
60
60
  const seoPage = { ...pageResult, seoReport: undefined };
61
61
  this.seoPages.push(seoPage);
62
62
  return;
@@ -517,6 +517,12 @@ export function detectBlock(response, body) {
517
517
  description: 'DataDome headers detected',
518
518
  });
519
519
  }
520
+ if (response.status === 200 &&
521
+ results.length === 0 &&
522
+ !location &&
523
+ !body) {
524
+ return { blocked: false, confidence: 0 };
525
+ }
520
526
  if (body) {
521
527
  const isLongBody = body.length > 100_000;
522
528
  const checkBody = isLongBody ? body.slice(0, 30_000) : body;
@@ -588,6 +594,12 @@ export function detectCaptcha(response, body) {
588
594
  addMatch(matches, 'cloudflare', 0.7, 'Cloudflare challenge headers detected');
589
595
  }
590
596
  }
597
+ if (matches.length === 0 &&
598
+ response.status >= 200 && response.status < 300 &&
599
+ !location &&
600
+ !body) {
601
+ return { detected: false, confidence: 0 };
602
+ }
591
603
  if (body) {
592
604
  const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
593
605
  const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
package/dist/version.js CHANGED
@@ -1,4 +1,4 @@
1
- const VERSION = '1.0.94';
1
+ const VERSION = '1.0.95';
2
2
  let _version = null;
3
3
  export async function getVersion() {
4
4
  if (_version)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recker",
3
- "version": "1.0.94",
3
+ "version": "1.0.95",
4
4
  "description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",