recker 1.0.93-next.10b1d32 → 1.0.93-next.c2e60bf
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/spider.d.ts +17 -0
- package/dist/browser/scrape/spider.js +28 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -0
- package/dist/browser/seo/seo-spider.js +10 -0
- package/dist/cli/index.js +0 -3
- package/dist/scrape/spider.d.ts +17 -0
- package/dist/scrape/spider.js +28 -0
- package/dist/seo/seo-spider.d.ts +2 -0
- package/dist/seo/seo-spider.js +10 -0
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -40,6 +40,23 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
|
@@ -241,6 +241,10 @@ export class Spider {
|
|
|
241
241
|
onPage: options.onPage,
|
|
242
242
|
onPageWithHtml: options.onPageWithHtml,
|
|
243
243
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
244
|
+
onBlocked: options.onBlocked,
|
|
245
|
+
onError: options.onError,
|
|
246
|
+
onRetry: options.onRetry,
|
|
247
|
+
onRedirect: options.onRedirect,
|
|
244
248
|
onProgress: options.onProgress,
|
|
245
249
|
extract: extractSchema,
|
|
246
250
|
parserOptions: options.parserOptions,
|
|
@@ -611,6 +615,9 @@ export class Spider {
|
|
|
611
615
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
616
|
const response = await clientForRequest.get(url, {
|
|
613
617
|
headers: this.buildRequestHeaders(url, false),
|
|
618
|
+
beforeRedirect: this.options.onRedirect
|
|
619
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
620
|
+
: undefined,
|
|
614
621
|
});
|
|
615
622
|
const contentType = response.headers.get('content-type') || '';
|
|
616
623
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -732,6 +739,18 @@ export class Spider {
|
|
|
732
739
|
forcedTransport = 'curl';
|
|
733
740
|
}
|
|
734
741
|
}
|
|
742
|
+
if (this.options.onRetry) {
|
|
743
|
+
await this.options.onRetry({
|
|
744
|
+
url,
|
|
745
|
+
attempt: attempt + 1,
|
|
746
|
+
maxAttempts,
|
|
747
|
+
reason: attemptReason,
|
|
748
|
+
delay: waitMs,
|
|
749
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
750
|
+
previousStatus: response.status,
|
|
751
|
+
timings,
|
|
752
|
+
});
|
|
753
|
+
}
|
|
735
754
|
await sleep(waitMs);
|
|
736
755
|
continue;
|
|
737
756
|
}
|
|
@@ -868,6 +887,12 @@ export class Spider {
|
|
|
868
887
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
888
|
this._resultCount++;
|
|
870
889
|
this.options.onPage?.(nonHtmlResult);
|
|
890
|
+
if (this.options.onPageWithHtml && html) {
|
|
891
|
+
await this.options.onPageWithHtml(nonHtmlResult, html);
|
|
892
|
+
}
|
|
893
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
894
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
895
|
+
}
|
|
871
896
|
return;
|
|
872
897
|
}
|
|
873
898
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -1027,6 +1052,9 @@ export class Spider {
|
|
|
1027
1052
|
this._resultCount++;
|
|
1028
1053
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
1054
|
this.options.onPage?.(errorResult);
|
|
1055
|
+
if (this.options.onError) {
|
|
1056
|
+
await this.options.onError(errorResult);
|
|
1057
|
+
}
|
|
1030
1058
|
}
|
|
1031
1059
|
}
|
|
1032
1060
|
getOrCreateDomainState(hostname) {
|
|
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
+
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
+
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
8
10
|
focusCategories?: string[];
|
|
9
11
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
10
12
|
}
|
|
@@ -39,6 +39,16 @@ export class SeoSpider {
|
|
|
39
39
|
await this.analyzePageDuringCrawl(pageResult, html);
|
|
40
40
|
}
|
|
41
41
|
: undefined,
|
|
42
|
+
onBlocked: this.options.onBlocked
|
|
43
|
+
? async (pageResult) => {
|
|
44
|
+
await this.options.onBlocked({ ...pageResult });
|
|
45
|
+
}
|
|
46
|
+
: undefined,
|
|
47
|
+
onError: this.options.onError
|
|
48
|
+
? async (pageResult) => {
|
|
49
|
+
await this.options.onError({ ...pageResult });
|
|
50
|
+
}
|
|
51
|
+
: undefined,
|
|
42
52
|
});
|
|
43
53
|
}
|
|
44
54
|
async analyzePageDuringCrawl(pageResult, html) {
|
package/dist/cli/index.js
CHANGED
|
@@ -51,9 +51,6 @@ async function main() {
|
|
|
51
51
|
const { handleRequest } = await import('./handler.js');
|
|
52
52
|
const { resolvePreset } = await import('./presets.js');
|
|
53
53
|
const presets = await import('../presets/index.js');
|
|
54
|
-
import('../utils/binary-manager.js')
|
|
55
|
-
.then(({ ensureCurlImpersonate }) => ensureCurlImpersonate(console))
|
|
56
|
-
.catch(() => { });
|
|
57
54
|
const version = await getVersion();
|
|
58
55
|
function parseMixedArgs(args, initialClientOptions = {}) {
|
|
59
56
|
const headers = { ...initialClientOptions.headers };
|
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -40,6 +40,23 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
package/dist/scrape/spider.js
CHANGED
|
@@ -241,6 +241,10 @@ export class Spider {
|
|
|
241
241
|
onPage: options.onPage,
|
|
242
242
|
onPageWithHtml: options.onPageWithHtml,
|
|
243
243
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
244
|
+
onBlocked: options.onBlocked,
|
|
245
|
+
onError: options.onError,
|
|
246
|
+
onRetry: options.onRetry,
|
|
247
|
+
onRedirect: options.onRedirect,
|
|
244
248
|
onProgress: options.onProgress,
|
|
245
249
|
extract: extractSchema,
|
|
246
250
|
parserOptions: options.parserOptions,
|
|
@@ -611,6 +615,9 @@ export class Spider {
|
|
|
611
615
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
616
|
const response = await clientForRequest.get(url, {
|
|
613
617
|
headers: this.buildRequestHeaders(url, false),
|
|
618
|
+
beforeRedirect: this.options.onRedirect
|
|
619
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
620
|
+
: undefined,
|
|
614
621
|
});
|
|
615
622
|
const contentType = response.headers.get('content-type') || '';
|
|
616
623
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -732,6 +739,18 @@ export class Spider {
|
|
|
732
739
|
forcedTransport = 'curl';
|
|
733
740
|
}
|
|
734
741
|
}
|
|
742
|
+
if (this.options.onRetry) {
|
|
743
|
+
await this.options.onRetry({
|
|
744
|
+
url,
|
|
745
|
+
attempt: attempt + 1,
|
|
746
|
+
maxAttempts,
|
|
747
|
+
reason: attemptReason,
|
|
748
|
+
delay: waitMs,
|
|
749
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
750
|
+
previousStatus: response.status,
|
|
751
|
+
timings,
|
|
752
|
+
});
|
|
753
|
+
}
|
|
735
754
|
await sleep(waitMs);
|
|
736
755
|
continue;
|
|
737
756
|
}
|
|
@@ -868,6 +887,12 @@ export class Spider {
|
|
|
868
887
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
888
|
this._resultCount++;
|
|
870
889
|
this.options.onPage?.(nonHtmlResult);
|
|
890
|
+
if (this.options.onPageWithHtml && html) {
|
|
891
|
+
await this.options.onPageWithHtml(nonHtmlResult, html);
|
|
892
|
+
}
|
|
893
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
894
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
895
|
+
}
|
|
871
896
|
return;
|
|
872
897
|
}
|
|
873
898
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -1027,6 +1052,9 @@ export class Spider {
|
|
|
1027
1052
|
this._resultCount++;
|
|
1028
1053
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
1054
|
this.options.onPage?.(errorResult);
|
|
1055
|
+
if (this.options.onError) {
|
|
1056
|
+
await this.options.onError(errorResult);
|
|
1057
|
+
}
|
|
1030
1058
|
}
|
|
1031
1059
|
}
|
|
1032
1060
|
getOrCreateDomainState(hostname) {
|
package/dist/seo/seo-spider.d.ts
CHANGED
|
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
+
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
+
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
8
10
|
focusCategories?: string[];
|
|
9
11
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
10
12
|
}
|
package/dist/seo/seo-spider.js
CHANGED
|
@@ -39,6 +39,16 @@ export class SeoSpider {
|
|
|
39
39
|
await this.analyzePageDuringCrawl(pageResult, html);
|
|
40
40
|
}
|
|
41
41
|
: undefined,
|
|
42
|
+
onBlocked: this.options.onBlocked
|
|
43
|
+
? async (pageResult) => {
|
|
44
|
+
await this.options.onBlocked({ ...pageResult });
|
|
45
|
+
}
|
|
46
|
+
: undefined,
|
|
47
|
+
onError: this.options.onError
|
|
48
|
+
? async (pageResult) => {
|
|
49
|
+
await this.options.onError({ ...pageResult });
|
|
50
|
+
}
|
|
51
|
+
: undefined,
|
|
42
52
|
});
|
|
43
53
|
}
|
|
44
54
|
async analyzePageDuringCrawl(pageResult, html) {
|
package/dist/version.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recker",
|
|
3
|
-
"version": "1.0.93-next.10b1d32",
|
|
3
|
+
"version": "1.0.93-next.c2e60bf",
|
|
4
4
|
"description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|