recker 1.0.93-next.c2e60bf → 1.0.93-next.ecac5c4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/spider.d.ts +0 -17
- package/dist/browser/scrape/spider.js +0 -25
- package/dist/browser/seo/seo-spider.d.ts +0 -2
- package/dist/browser/seo/seo-spider.js +0 -10
- package/dist/scrape/spider.d.ts +0 -17
- package/dist/scrape/spider.js +0 -25
- package/dist/seo/seo-spider.d.ts +0 -2
- package/dist/seo/seo-spider.js +0 -10
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -40,23 +40,6 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
-
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
-
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
-
onRetry?: (info: {
|
|
46
|
-
url: string;
|
|
47
|
-
attempt: number;
|
|
48
|
-
maxAttempts: number;
|
|
49
|
-
reason?: string;
|
|
50
|
-
delay: number;
|
|
51
|
-
transport: SpiderTransport;
|
|
52
|
-
previousStatus: number;
|
|
53
|
-
timings?: SpiderPageResult['timings'];
|
|
54
|
-
}) => void | Promise<void>;
|
|
55
|
-
onRedirect?: (info: {
|
|
56
|
-
from: string;
|
|
57
|
-
to: string;
|
|
58
|
-
status: number;
|
|
59
|
-
}) => void | Promise<void>;
|
|
60
43
|
onProgress?: (progress: SpiderProgress) => void;
|
|
61
44
|
extract?: string[] | ExtractionSchema;
|
|
62
45
|
parserOptions?: Partial<ParserOptions>;
|
|
@@ -241,10 +241,6 @@ export class Spider {
|
|
|
241
241
|
onPage: options.onPage,
|
|
242
242
|
onPageWithHtml: options.onPageWithHtml,
|
|
243
243
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
244
|
-
onBlocked: options.onBlocked,
|
|
245
|
-
onError: options.onError,
|
|
246
|
-
onRetry: options.onRetry,
|
|
247
|
-
onRedirect: options.onRedirect,
|
|
248
244
|
onProgress: options.onProgress,
|
|
249
245
|
extract: extractSchema,
|
|
250
246
|
parserOptions: options.parserOptions,
|
|
@@ -615,9 +611,6 @@ export class Spider {
|
|
|
615
611
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
616
612
|
const response = await clientForRequest.get(url, {
|
|
617
613
|
headers: this.buildRequestHeaders(url, false),
|
|
618
|
-
beforeRedirect: this.options.onRedirect
|
|
619
|
-
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
620
|
-
: undefined,
|
|
621
614
|
});
|
|
622
615
|
const contentType = response.headers.get('content-type') || '';
|
|
623
616
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -739,18 +732,6 @@ export class Spider {
|
|
|
739
732
|
forcedTransport = 'curl';
|
|
740
733
|
}
|
|
741
734
|
}
|
|
742
|
-
if (this.options.onRetry) {
|
|
743
|
-
await this.options.onRetry({
|
|
744
|
-
url,
|
|
745
|
-
attempt: attempt + 1,
|
|
746
|
-
maxAttempts,
|
|
747
|
-
reason: attemptReason,
|
|
748
|
-
delay: waitMs,
|
|
749
|
-
transport: forcedTransport ?? transportForAttempt,
|
|
750
|
-
previousStatus: response.status,
|
|
751
|
-
timings,
|
|
752
|
-
});
|
|
753
|
-
}
|
|
754
735
|
await sleep(waitMs);
|
|
755
736
|
continue;
|
|
756
737
|
}
|
|
@@ -890,9 +871,6 @@ export class Spider {
|
|
|
890
871
|
if (this.options.onPageWithHtml && html) {
|
|
891
872
|
await this.options.onPageWithHtml(nonHtmlResult, html);
|
|
892
873
|
}
|
|
893
|
-
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
894
|
-
await this.options.onBlocked(nonHtmlResult);
|
|
895
|
-
}
|
|
896
874
|
return;
|
|
897
875
|
}
|
|
898
876
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -1052,9 +1030,6 @@ export class Spider {
|
|
|
1052
1030
|
this._resultCount++;
|
|
1053
1031
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1054
1032
|
this.options.onPage?.(errorResult);
|
|
1055
|
-
if (this.options.onError) {
|
|
1056
|
-
await this.options.onError(errorResult);
|
|
1057
|
-
}
|
|
1058
1033
|
}
|
|
1059
1034
|
}
|
|
1060
1035
|
getOrCreateDomainState(hostname) {
|
|
@@ -5,8 +5,6 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
-
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
-
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
10
8
|
focusCategories?: string[];
|
|
11
9
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
12
10
|
}
|
|
@@ -39,16 +39,6 @@ export class SeoSpider {
|
|
|
39
39
|
await this.analyzePageDuringCrawl(pageResult, html);
|
|
40
40
|
}
|
|
41
41
|
: undefined,
|
|
42
|
-
onBlocked: this.options.onBlocked
|
|
43
|
-
? async (pageResult) => {
|
|
44
|
-
await this.options.onBlocked({ ...pageResult });
|
|
45
|
-
}
|
|
46
|
-
: undefined,
|
|
47
|
-
onError: this.options.onError
|
|
48
|
-
? async (pageResult) => {
|
|
49
|
-
await this.options.onError({ ...pageResult });
|
|
50
|
-
}
|
|
51
|
-
: undefined,
|
|
52
42
|
});
|
|
53
43
|
}
|
|
54
44
|
async analyzePageDuringCrawl(pageResult, html) {
|
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -40,23 +40,6 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
-
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
-
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
-
onRetry?: (info: {
|
|
46
|
-
url: string;
|
|
47
|
-
attempt: number;
|
|
48
|
-
maxAttempts: number;
|
|
49
|
-
reason?: string;
|
|
50
|
-
delay: number;
|
|
51
|
-
transport: SpiderTransport;
|
|
52
|
-
previousStatus: number;
|
|
53
|
-
timings?: SpiderPageResult['timings'];
|
|
54
|
-
}) => void | Promise<void>;
|
|
55
|
-
onRedirect?: (info: {
|
|
56
|
-
from: string;
|
|
57
|
-
to: string;
|
|
58
|
-
status: number;
|
|
59
|
-
}) => void | Promise<void>;
|
|
60
43
|
onProgress?: (progress: SpiderProgress) => void;
|
|
61
44
|
extract?: string[] | ExtractionSchema;
|
|
62
45
|
parserOptions?: Partial<ParserOptions>;
|
package/dist/scrape/spider.js
CHANGED
|
@@ -241,10 +241,6 @@ export class Spider {
|
|
|
241
241
|
onPage: options.onPage,
|
|
242
242
|
onPageWithHtml: options.onPageWithHtml,
|
|
243
243
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
244
|
-
onBlocked: options.onBlocked,
|
|
245
|
-
onError: options.onError,
|
|
246
|
-
onRetry: options.onRetry,
|
|
247
|
-
onRedirect: options.onRedirect,
|
|
248
244
|
onProgress: options.onProgress,
|
|
249
245
|
extract: extractSchema,
|
|
250
246
|
parserOptions: options.parserOptions,
|
|
@@ -615,9 +611,6 @@ export class Spider {
|
|
|
615
611
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
616
612
|
const response = await clientForRequest.get(url, {
|
|
617
613
|
headers: this.buildRequestHeaders(url, false),
|
|
618
|
-
beforeRedirect: this.options.onRedirect
|
|
619
|
-
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
620
|
-
: undefined,
|
|
621
614
|
});
|
|
622
615
|
const contentType = response.headers.get('content-type') || '';
|
|
623
616
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -739,18 +732,6 @@ export class Spider {
|
|
|
739
732
|
forcedTransport = 'curl';
|
|
740
733
|
}
|
|
741
734
|
}
|
|
742
|
-
if (this.options.onRetry) {
|
|
743
|
-
await this.options.onRetry({
|
|
744
|
-
url,
|
|
745
|
-
attempt: attempt + 1,
|
|
746
|
-
maxAttempts,
|
|
747
|
-
reason: attemptReason,
|
|
748
|
-
delay: waitMs,
|
|
749
|
-
transport: forcedTransport ?? transportForAttempt,
|
|
750
|
-
previousStatus: response.status,
|
|
751
|
-
timings,
|
|
752
|
-
});
|
|
753
|
-
}
|
|
754
735
|
await sleep(waitMs);
|
|
755
736
|
continue;
|
|
756
737
|
}
|
|
@@ -890,9 +871,6 @@ export class Spider {
|
|
|
890
871
|
if (this.options.onPageWithHtml && html) {
|
|
891
872
|
await this.options.onPageWithHtml(nonHtmlResult, html);
|
|
892
873
|
}
|
|
893
|
-
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
894
|
-
await this.options.onBlocked(nonHtmlResult);
|
|
895
|
-
}
|
|
896
874
|
return;
|
|
897
875
|
}
|
|
898
876
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -1052,9 +1030,6 @@ export class Spider {
|
|
|
1052
1030
|
this._resultCount++;
|
|
1053
1031
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1054
1032
|
this.options.onPage?.(errorResult);
|
|
1055
|
-
if (this.options.onError) {
|
|
1056
|
-
await this.options.onError(errorResult);
|
|
1057
|
-
}
|
|
1058
1033
|
}
|
|
1059
1034
|
}
|
|
1060
1035
|
getOrCreateDomainState(hostname) {
|
package/dist/seo/seo-spider.d.ts
CHANGED
|
@@ -5,8 +5,6 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
-
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
-
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
10
8
|
focusCategories?: string[];
|
|
11
9
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
12
10
|
}
|
package/dist/seo/seo-spider.js
CHANGED
|
@@ -39,16 +39,6 @@ export class SeoSpider {
|
|
|
39
39
|
await this.analyzePageDuringCrawl(pageResult, html);
|
|
40
40
|
}
|
|
41
41
|
: undefined,
|
|
42
|
-
onBlocked: this.options.onBlocked
|
|
43
|
-
? async (pageResult) => {
|
|
44
|
-
await this.options.onBlocked({ ...pageResult });
|
|
45
|
-
}
|
|
46
|
-
: undefined,
|
|
47
|
-
onError: this.options.onError
|
|
48
|
-
? async (pageResult) => {
|
|
49
|
-
await this.options.onError({ ...pageResult });
|
|
50
|
-
}
|
|
51
|
-
: undefined,
|
|
52
42
|
});
|
|
53
43
|
}
|
|
54
44
|
async analyzePageDuringCrawl(pageResult, html) {
|
package/dist/version.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recker",
|
|
3
|
-
"version": "1.0.93-next.
|
|
3
|
+
"version": "1.0.93-next.ecac5c4",
|
|
4
4
|
"description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|