recker 1.0.93-next.40856cc → 1.0.93-next.c2e60bf

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -40,6 +40,23 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
@@ -241,6 +241,10 @@ export class Spider {
241
241
  onPage: options.onPage,
242
242
  onPageWithHtml: options.onPageWithHtml,
243
243
  onCaptchaDetected: options.onCaptchaDetected,
244
+ onBlocked: options.onBlocked,
245
+ onError: options.onError,
246
+ onRetry: options.onRetry,
247
+ onRedirect: options.onRedirect,
244
248
  onProgress: options.onProgress,
245
249
  extract: extractSchema,
246
250
  parserOptions: options.parserOptions,
@@ -611,6 +615,9 @@ export class Spider {
611
615
  const clientForRequest = this.getClientForProxy(proxyUrl);
612
616
  const response = await clientForRequest.get(url, {
613
617
  headers: this.buildRequestHeaders(url, false),
618
+ beforeRedirect: this.options.onRedirect
619
+ ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
620
+ : undefined,
614
621
  });
615
622
  const contentType = response.headers.get('content-type') || '';
616
623
  const shouldReadUndiciBody = !contentType ||
@@ -732,6 +739,18 @@ export class Spider {
732
739
  forcedTransport = 'curl';
733
740
  }
734
741
  }
742
+ if (this.options.onRetry) {
743
+ await this.options.onRetry({
744
+ url,
745
+ attempt: attempt + 1,
746
+ maxAttempts,
747
+ reason: attemptReason,
748
+ delay: waitMs,
749
+ transport: forcedTransport ?? transportForAttempt,
750
+ previousStatus: response.status,
751
+ timings,
752
+ });
753
+ }
735
754
  await sleep(waitMs);
736
755
  continue;
737
756
  }
@@ -868,6 +887,12 @@ export class Spider {
868
887
  await this.crawlStorage.saveResult(nonHtmlResult);
869
888
  this._resultCount++;
870
889
  this.options.onPage?.(nonHtmlResult);
890
+ if (this.options.onPageWithHtml && html) {
891
+ await this.options.onPageWithHtml(nonHtmlResult, html);
892
+ }
893
+ if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
894
+ await this.options.onBlocked(nonHtmlResult);
895
+ }
871
896
  return;
872
897
  }
873
898
  const doc = await ScrapeDocument.create(html, {
@@ -1027,6 +1052,9 @@ export class Spider {
1027
1052
  this._resultCount++;
1028
1053
  await this.crawlStorage.saveError({ url: item.url, error: message });
1029
1054
  this.options.onPage?.(errorResult);
1055
+ if (this.options.onError) {
1056
+ await this.options.onError(errorResult);
1057
+ }
1030
1058
  }
1031
1059
  }
1032
1060
  getOrCreateDomainState(hostname) {
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
5
5
  seo?: boolean;
6
6
  output?: string;
7
7
  onSeoAnalysis?: (result: SeoPageResult) => void;
8
+ onBlocked?: (result: SeoPageResult) => void | Promise<void>;
9
+ onError?: (result: SeoPageResult) => void | Promise<void>;
8
10
  focusCategories?: string[];
9
11
  focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
10
12
  }
@@ -39,6 +39,16 @@ export class SeoSpider {
39
39
  await this.analyzePageDuringCrawl(pageResult, html);
40
40
  }
41
41
  : undefined,
42
+ onBlocked: this.options.onBlocked
43
+ ? async (pageResult) => {
44
+ await this.options.onBlocked({ ...pageResult });
45
+ }
46
+ : undefined,
47
+ onError: this.options.onError
48
+ ? async (pageResult) => {
49
+ await this.options.onError({ ...pageResult });
50
+ }
51
+ : undefined,
42
52
  });
43
53
  }
44
54
  async analyzePageDuringCrawl(pageResult, html) {
@@ -40,6 +40,23 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
@@ -241,6 +241,10 @@ export class Spider {
241
241
  onPage: options.onPage,
242
242
  onPageWithHtml: options.onPageWithHtml,
243
243
  onCaptchaDetected: options.onCaptchaDetected,
244
+ onBlocked: options.onBlocked,
245
+ onError: options.onError,
246
+ onRetry: options.onRetry,
247
+ onRedirect: options.onRedirect,
244
248
  onProgress: options.onProgress,
245
249
  extract: extractSchema,
246
250
  parserOptions: options.parserOptions,
@@ -611,6 +615,9 @@ export class Spider {
611
615
  const clientForRequest = this.getClientForProxy(proxyUrl);
612
616
  const response = await clientForRequest.get(url, {
613
617
  headers: this.buildRequestHeaders(url, false),
618
+ beforeRedirect: this.options.onRedirect
619
+ ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
620
+ : undefined,
614
621
  });
615
622
  const contentType = response.headers.get('content-type') || '';
616
623
  const shouldReadUndiciBody = !contentType ||
@@ -732,6 +739,18 @@ export class Spider {
732
739
  forcedTransport = 'curl';
733
740
  }
734
741
  }
742
+ if (this.options.onRetry) {
743
+ await this.options.onRetry({
744
+ url,
745
+ attempt: attempt + 1,
746
+ maxAttempts,
747
+ reason: attemptReason,
748
+ delay: waitMs,
749
+ transport: forcedTransport ?? transportForAttempt,
750
+ previousStatus: response.status,
751
+ timings,
752
+ });
753
+ }
735
754
  await sleep(waitMs);
736
755
  continue;
737
756
  }
@@ -868,6 +887,12 @@ export class Spider {
868
887
  await this.crawlStorage.saveResult(nonHtmlResult);
869
888
  this._resultCount++;
870
889
  this.options.onPage?.(nonHtmlResult);
890
+ if (this.options.onPageWithHtml && html) {
891
+ await this.options.onPageWithHtml(nonHtmlResult, html);
892
+ }
893
+ if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
894
+ await this.options.onBlocked(nonHtmlResult);
895
+ }
871
896
  return;
872
897
  }
873
898
  const doc = await ScrapeDocument.create(html, {
@@ -1027,6 +1052,9 @@ export class Spider {
1027
1052
  this._resultCount++;
1028
1053
  await this.crawlStorage.saveError({ url: item.url, error: message });
1029
1054
  this.options.onPage?.(errorResult);
1055
+ if (this.options.onError) {
1056
+ await this.options.onError(errorResult);
1057
+ }
1030
1058
  }
1031
1059
  }
1032
1060
  getOrCreateDomainState(hostname) {
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
5
5
  seo?: boolean;
6
6
  output?: string;
7
7
  onSeoAnalysis?: (result: SeoPageResult) => void;
8
+ onBlocked?: (result: SeoPageResult) => void | Promise<void>;
9
+ onError?: (result: SeoPageResult) => void | Promise<void>;
8
10
  focusCategories?: string[];
9
11
  focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
10
12
  }
@@ -39,6 +39,16 @@ export class SeoSpider {
39
39
  await this.analyzePageDuringCrawl(pageResult, html);
40
40
  }
41
41
  : undefined,
42
+ onBlocked: this.options.onBlocked
43
+ ? async (pageResult) => {
44
+ await this.options.onBlocked({ ...pageResult });
45
+ }
46
+ : undefined,
47
+ onError: this.options.onError
48
+ ? async (pageResult) => {
49
+ await this.options.onError({ ...pageResult });
50
+ }
51
+ : undefined,
42
52
  });
43
53
  }
44
54
  async analyzePageDuringCrawl(pageResult, html) {
package/dist/version.js CHANGED
@@ -1,4 +1,4 @@
1
- const VERSION = '1.0.93-next.40856cc';
1
+ const VERSION = '1.0.93-next.c2e60bf';
2
2
  let _version = null;
3
3
  export async function getVersion() {
4
4
  if (_version)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recker",
3
- "version": "1.0.93-next.40856cc",
3
+ "version": "1.0.93-next.c2e60bf",
4
4
  "description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",