recker 1.0.93-next.40856cc → 1.0.93-next.554b0c4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ export type { Options as ParserOptions } from './parser/index.js';
3
3
  export { ScrapeDocument } from './document.js';
4
4
  export { ScrapeElement } from './element.js';
5
5
  export { Spider, spider } from './spider.js';
6
- export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
6
+ export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
7
7
  export { InMemoryCrawlQueue } from './crawl-queue.js';
8
8
  export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
9
9
  export { InMemoryCrawlStorage } from './crawl-storage.js';
@@ -1,3 +1,4 @@
1
+ import { ScrapeDocument } from './document.js';
1
2
  import type { ExtractedLink, ExtractionSchema } from './types.js';
2
3
  import type { Options as ParserOptions } from './parser/index.js';
3
4
  import { type SitemapUrl } from '../seo/validators/sitemap.js';
@@ -31,8 +32,7 @@ export interface SpiderOptions {
31
32
  proxy?: string | string[] | ProxyAdapter;
32
33
  transport?: SpiderTransport;
33
34
  preferCurlFirst?: boolean;
34
- onPage?: (result: SpiderPageResult) => void;
35
- onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
35
+ onPage?: (event: SpiderPageEvent) => void | Promise<void>;
36
36
  onCaptchaDetected?: (result: {
37
37
  url: string;
38
38
  status: number;
@@ -40,6 +40,23 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
@@ -103,6 +120,11 @@ export interface SpiderPageResult {
103
120
  };
104
121
  extracted?: Record<string, unknown>;
105
122
  }
123
+ export interface SpiderPageEvent {
124
+ result: SpiderPageResult;
125
+ html?: string;
126
+ document?: () => Promise<ScrapeDocument>;
127
+ }
106
128
  export interface SpiderProgress {
107
129
  crawled: number;
108
130
  queued: number;
@@ -239,8 +239,11 @@ export class Spider {
239
239
  exclude: options.exclude,
240
240
  include: options.include,
241
241
  onPage: options.onPage,
242
- onPageWithHtml: options.onPageWithHtml,
243
242
  onCaptchaDetected: options.onCaptchaDetected,
243
+ onBlocked: options.onBlocked,
244
+ onError: options.onError,
245
+ onRetry: options.onRetry,
246
+ onRedirect: options.onRedirect,
244
247
  onProgress: options.onProgress,
245
248
  extract: extractSchema,
246
249
  parserOptions: options.parserOptions,
@@ -611,6 +614,9 @@ export class Spider {
611
614
  const clientForRequest = this.getClientForProxy(proxyUrl);
612
615
  const response = await clientForRequest.get(url, {
613
616
  headers: this.buildRequestHeaders(url, false),
617
+ beforeRedirect: this.options.onRedirect
618
+ ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
619
+ : undefined,
614
620
  });
615
621
  const contentType = response.headers.get('content-type') || '';
616
622
  const shouldReadUndiciBody = !contentType ||
@@ -732,6 +738,18 @@ export class Spider {
732
738
  forcedTransport = 'curl';
733
739
  }
734
740
  }
741
+ if (this.options.onRetry) {
742
+ await this.options.onRetry({
743
+ url,
744
+ attempt: attempt + 1,
745
+ maxAttempts,
746
+ reason: attemptReason,
747
+ delay: waitMs,
748
+ transport: forcedTransport ?? transportForAttempt,
749
+ previousStatus: response.status,
750
+ timings,
751
+ });
752
+ }
735
753
  await sleep(waitMs);
736
754
  continue;
737
755
  }
@@ -867,7 +885,21 @@ export class Spider {
867
885
  };
868
886
  await this.crawlStorage.saveResult(nonHtmlResult);
869
887
  this._resultCount++;
870
- this.options.onPage?.(nonHtmlResult);
888
+ if (this.options.onPage) {
889
+ let cachedDoc = null;
890
+ await this.options.onPage({
891
+ result: nonHtmlResult,
892
+ html: html || undefined,
893
+ document: html ? () => {
894
+ if (cachedDoc)
895
+ return Promise.resolve(cachedDoc);
896
+ return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
897
+ } : undefined,
898
+ });
899
+ }
900
+ if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
901
+ await this.options.onBlocked(nonHtmlResult);
902
+ }
871
903
  return;
872
904
  }
873
905
  const doc = await ScrapeDocument.create(html, {
@@ -929,9 +961,12 @@ export class Spider {
929
961
  };
930
962
  await this.crawlStorage.saveResult(result);
931
963
  this._resultCount++;
932
- this.options.onPage?.(result);
933
- if (this.options.onPageWithHtml) {
934
- await this.options.onPageWithHtml(result, html);
964
+ if (this.options.onPage) {
965
+ await this.options.onPage({
966
+ result,
967
+ html,
968
+ document: () => Promise.resolve(doc),
969
+ });
935
970
  }
936
971
  const candidates = [];
937
972
  const candidateUrls = [];
@@ -1026,7 +1061,12 @@ export class Spider {
1026
1061
  await this.crawlStorage.saveResult(errorResult);
1027
1062
  this._resultCount++;
1028
1063
  await this.crawlStorage.saveError({ url: item.url, error: message });
1029
- this.options.onPage?.(errorResult);
1064
+ if (this.options.onPage) {
1065
+ await this.options.onPage({ result: errorResult });
1066
+ }
1067
+ if (this.options.onError) {
1068
+ await this.options.onError(errorResult);
1069
+ }
1030
1070
  }
1031
1071
  }
1032
1072
  getOrCreateDomainState(hostname) {
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
5
5
  seo?: boolean;
6
6
  output?: string;
7
7
  onSeoAnalysis?: (result: SeoPageResult) => void;
8
+ onBlocked?: (result: SeoPageResult) => void | Promise<void>;
9
+ onError?: (result: SeoPageResult) => void | Promise<void>;
8
10
  focusCategories?: string[];
9
11
  focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
10
12
  }
@@ -32,11 +32,25 @@ export class SeoSpider {
32
32
  }
33
33
  constructor(options = {}) {
34
34
  this.options = options;
35
+ const userOnPage = options.onPage;
35
36
  this.spider = new Spider({
36
37
  ...options,
37
- onPageWithHtml: this.options.seo
38
- ? async (pageResult, html) => {
39
- await this.analyzePageDuringCrawl(pageResult, html);
38
+ onPage: async (event) => {
39
+ if (this.options.seo && event.html) {
40
+ await this.analyzePageDuringCrawl(event.result, event.html);
41
+ }
42
+ if (userOnPage) {
43
+ await userOnPage(event);
44
+ }
45
+ },
46
+ onBlocked: this.options.onBlocked
47
+ ? async (pageResult) => {
48
+ await this.options.onBlocked({ ...pageResult });
49
+ }
50
+ : undefined,
51
+ onError: this.options.onError
52
+ ? async (pageResult) => {
53
+ await this.options.onError({ ...pageResult });
40
54
  }
41
55
  : undefined,
42
56
  });
@@ -102,7 +102,8 @@ export class SpiderRunner extends CommandEmitter {
102
102
  extract,
103
103
  include: include?.map(p => new RegExp(p)),
104
104
  exclude: exclude?.map(p => new RegExp(p)),
105
- onPage: (page) => {
105
+ onPage: (event) => {
106
+ const page = event.result;
106
107
  collectPageMetrics(page);
107
108
  pages.push({
108
109
  url: page.url,
@@ -57,7 +57,8 @@ export class SpiderJob {
57
57
  errors: 0,
58
58
  });
59
59
  },
60
- onPage: (result) => {
60
+ onPage: (event) => {
61
+ const result = event.result;
61
62
  if (result.error) {
62
63
  const currentProgress = this.job.progress;
63
64
  this.manager.updateProgress(this.job.id, {
@@ -3,7 +3,7 @@ export type { Options as ParserOptions } from './parser/index.js';
3
3
  export { ScrapeDocument } from './document.js';
4
4
  export { ScrapeElement } from './element.js';
5
5
  export { Spider, spider } from './spider.js';
6
- export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
6
+ export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
7
7
  export { InMemoryCrawlQueue } from './crawl-queue.js';
8
8
  export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
9
9
  export { InMemoryCrawlStorage } from './crawl-storage.js';
@@ -1,3 +1,4 @@
1
+ import { ScrapeDocument } from './document.js';
1
2
  import type { ExtractedLink, ExtractionSchema } from './types.js';
2
3
  import type { Options as ParserOptions } from './parser/index.js';
3
4
  import { type SitemapUrl } from '../seo/validators/sitemap.js';
@@ -31,8 +32,7 @@ export interface SpiderOptions {
31
32
  proxy?: string | string[] | ProxyAdapter;
32
33
  transport?: SpiderTransport;
33
34
  preferCurlFirst?: boolean;
34
- onPage?: (result: SpiderPageResult) => void;
35
- onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
35
+ onPage?: (event: SpiderPageEvent) => void | Promise<void>;
36
36
  onCaptchaDetected?: (result: {
37
37
  url: string;
38
38
  status: number;
@@ -40,6 +40,23 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
@@ -103,6 +120,11 @@ export interface SpiderPageResult {
103
120
  };
104
121
  extracted?: Record<string, unknown>;
105
122
  }
123
+ export interface SpiderPageEvent {
124
+ result: SpiderPageResult;
125
+ html?: string;
126
+ document?: () => Promise<ScrapeDocument>;
127
+ }
106
128
  export interface SpiderProgress {
107
129
  crawled: number;
108
130
  queued: number;
@@ -239,8 +239,11 @@ export class Spider {
239
239
  exclude: options.exclude,
240
240
  include: options.include,
241
241
  onPage: options.onPage,
242
- onPageWithHtml: options.onPageWithHtml,
243
242
  onCaptchaDetected: options.onCaptchaDetected,
243
+ onBlocked: options.onBlocked,
244
+ onError: options.onError,
245
+ onRetry: options.onRetry,
246
+ onRedirect: options.onRedirect,
244
247
  onProgress: options.onProgress,
245
248
  extract: extractSchema,
246
249
  parserOptions: options.parserOptions,
@@ -611,6 +614,9 @@ export class Spider {
611
614
  const clientForRequest = this.getClientForProxy(proxyUrl);
612
615
  const response = await clientForRequest.get(url, {
613
616
  headers: this.buildRequestHeaders(url, false),
617
+ beforeRedirect: this.options.onRedirect
618
+ ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
619
+ : undefined,
614
620
  });
615
621
  const contentType = response.headers.get('content-type') || '';
616
622
  const shouldReadUndiciBody = !contentType ||
@@ -732,6 +738,18 @@ export class Spider {
732
738
  forcedTransport = 'curl';
733
739
  }
734
740
  }
741
+ if (this.options.onRetry) {
742
+ await this.options.onRetry({
743
+ url,
744
+ attempt: attempt + 1,
745
+ maxAttempts,
746
+ reason: attemptReason,
747
+ delay: waitMs,
748
+ transport: forcedTransport ?? transportForAttempt,
749
+ previousStatus: response.status,
750
+ timings,
751
+ });
752
+ }
735
753
  await sleep(waitMs);
736
754
  continue;
737
755
  }
@@ -867,7 +885,21 @@ export class Spider {
867
885
  };
868
886
  await this.crawlStorage.saveResult(nonHtmlResult);
869
887
  this._resultCount++;
870
- this.options.onPage?.(nonHtmlResult);
888
+ if (this.options.onPage) {
889
+ let cachedDoc = null;
890
+ await this.options.onPage({
891
+ result: nonHtmlResult,
892
+ html: html || undefined,
893
+ document: html ? () => {
894
+ if (cachedDoc)
895
+ return Promise.resolve(cachedDoc);
896
+ return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
897
+ } : undefined,
898
+ });
899
+ }
900
+ if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
901
+ await this.options.onBlocked(nonHtmlResult);
902
+ }
871
903
  return;
872
904
  }
873
905
  const doc = await ScrapeDocument.create(html, {
@@ -929,9 +961,12 @@ export class Spider {
929
961
  };
930
962
  await this.crawlStorage.saveResult(result);
931
963
  this._resultCount++;
932
- this.options.onPage?.(result);
933
- if (this.options.onPageWithHtml) {
934
- await this.options.onPageWithHtml(result, html);
964
+ if (this.options.onPage) {
965
+ await this.options.onPage({
966
+ result,
967
+ html,
968
+ document: () => Promise.resolve(doc),
969
+ });
935
970
  }
936
971
  const candidates = [];
937
972
  const candidateUrls = [];
@@ -1026,7 +1061,12 @@ export class Spider {
1026
1061
  await this.crawlStorage.saveResult(errorResult);
1027
1062
  this._resultCount++;
1028
1063
  await this.crawlStorage.saveError({ url: item.url, error: message });
1029
- this.options.onPage?.(errorResult);
1064
+ if (this.options.onPage) {
1065
+ await this.options.onPage({ result: errorResult });
1066
+ }
1067
+ if (this.options.onError) {
1068
+ await this.options.onError(errorResult);
1069
+ }
1030
1070
  }
1031
1071
  }
1032
1072
  getOrCreateDomainState(hostname) {
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
5
5
  seo?: boolean;
6
6
  output?: string;
7
7
  onSeoAnalysis?: (result: SeoPageResult) => void;
8
+ onBlocked?: (result: SeoPageResult) => void | Promise<void>;
9
+ onError?: (result: SeoPageResult) => void | Promise<void>;
8
10
  focusCategories?: string[];
9
11
  focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
10
12
  }
@@ -32,11 +32,25 @@ export class SeoSpider {
32
32
  }
33
33
  constructor(options = {}) {
34
34
  this.options = options;
35
+ const userOnPage = options.onPage;
35
36
  this.spider = new Spider({
36
37
  ...options,
37
- onPageWithHtml: this.options.seo
38
- ? async (pageResult, html) => {
39
- await this.analyzePageDuringCrawl(pageResult, html);
38
+ onPage: async (event) => {
39
+ if (this.options.seo && event.html) {
40
+ await this.analyzePageDuringCrawl(event.result, event.html);
41
+ }
42
+ if (userOnPage) {
43
+ await userOnPage(event);
44
+ }
45
+ },
46
+ onBlocked: this.options.onBlocked
47
+ ? async (pageResult) => {
48
+ await this.options.onBlocked({ ...pageResult });
49
+ }
50
+ : undefined,
51
+ onError: this.options.onError
52
+ ? async (pageResult) => {
53
+ await this.options.onError({ ...pageResult });
40
54
  }
41
55
  : undefined,
42
56
  });
package/dist/version.js CHANGED
@@ -1,4 +1,4 @@
1
- const VERSION = '1.0.93-next.40856cc';
1
+ const VERSION = '1.0.93-next.554b0c4';
2
2
  let _version = null;
3
3
  export async function getVersion() {
4
4
  if (_version)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recker",
3
- "version": "1.0.93-next.40856cc",
3
+ "version": "1.0.93-next.554b0c4",
4
4
  "description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",