recker 1.0.93-next.40856cc → 1.0.93-next.554b0c4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/index.d.ts +1 -1
- package/dist/browser/scrape/spider.d.ts +24 -2
- package/dist/browser/scrape/spider.js +46 -6
- package/dist/browser/seo/seo-spider.d.ts +2 -0
- package/dist/browser/seo/seo-spider.js +17 -3
- package/dist/cli/commands/spider-runner.js +2 -1
- package/dist/cli/tui/jobs/spider-job.js +2 -1
- package/dist/scrape/index.d.ts +1 -1
- package/dist/scrape/spider.d.ts +24 -2
- package/dist/scrape/spider.js +46 -6
- package/dist/seo/seo-spider.d.ts +2 -0
- package/dist/seo/seo-spider.js +17 -3
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -3,7 +3,7 @@ export type { Options as ParserOptions } from './parser/index.js';
|
|
|
3
3
|
export { ScrapeDocument } from './document.js';
|
|
4
4
|
export { ScrapeElement } from './element.js';
|
|
5
5
|
export { Spider, spider } from './spider.js';
|
|
6
|
-
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
6
|
+
export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
|
|
7
7
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
8
8
|
export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
9
9
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ScrapeDocument } from './document.js';
|
|
1
2
|
import type { ExtractedLink, ExtractionSchema } from './types.js';
|
|
2
3
|
import type { Options as ParserOptions } from './parser/index.js';
|
|
3
4
|
import { type SitemapUrl } from '../seo/validators/sitemap.js';
|
|
@@ -31,8 +32,7 @@ export interface SpiderOptions {
|
|
|
31
32
|
proxy?: string | string[] | ProxyAdapter;
|
|
32
33
|
transport?: SpiderTransport;
|
|
33
34
|
preferCurlFirst?: boolean;
|
|
34
|
-
onPage?: (
|
|
35
|
-
onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
|
|
35
|
+
onPage?: (event: SpiderPageEvent) => void | Promise<void>;
|
|
36
36
|
onCaptchaDetected?: (result: {
|
|
37
37
|
url: string;
|
|
38
38
|
status: number;
|
|
@@ -40,6 +40,23 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
|
@@ -103,6 +120,11 @@ export interface SpiderPageResult {
|
|
|
103
120
|
};
|
|
104
121
|
extracted?: Record<string, unknown>;
|
|
105
122
|
}
|
|
123
|
+
export interface SpiderPageEvent {
|
|
124
|
+
result: SpiderPageResult;
|
|
125
|
+
html?: string;
|
|
126
|
+
document?: () => Promise<ScrapeDocument>;
|
|
127
|
+
}
|
|
106
128
|
export interface SpiderProgress {
|
|
107
129
|
crawled: number;
|
|
108
130
|
queued: number;
|
|
@@ -239,8 +239,11 @@ export class Spider {
|
|
|
239
239
|
exclude: options.exclude,
|
|
240
240
|
include: options.include,
|
|
241
241
|
onPage: options.onPage,
|
|
242
|
-
onPageWithHtml: options.onPageWithHtml,
|
|
243
242
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
243
|
+
onBlocked: options.onBlocked,
|
|
244
|
+
onError: options.onError,
|
|
245
|
+
onRetry: options.onRetry,
|
|
246
|
+
onRedirect: options.onRedirect,
|
|
244
247
|
onProgress: options.onProgress,
|
|
245
248
|
extract: extractSchema,
|
|
246
249
|
parserOptions: options.parserOptions,
|
|
@@ -611,6 +614,9 @@ export class Spider {
|
|
|
611
614
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
615
|
const response = await clientForRequest.get(url, {
|
|
613
616
|
headers: this.buildRequestHeaders(url, false),
|
|
617
|
+
beforeRedirect: this.options.onRedirect
|
|
618
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
619
|
+
: undefined,
|
|
614
620
|
});
|
|
615
621
|
const contentType = response.headers.get('content-type') || '';
|
|
616
622
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -732,6 +738,18 @@ export class Spider {
|
|
|
732
738
|
forcedTransport = 'curl';
|
|
733
739
|
}
|
|
734
740
|
}
|
|
741
|
+
if (this.options.onRetry) {
|
|
742
|
+
await this.options.onRetry({
|
|
743
|
+
url,
|
|
744
|
+
attempt: attempt + 1,
|
|
745
|
+
maxAttempts,
|
|
746
|
+
reason: attemptReason,
|
|
747
|
+
delay: waitMs,
|
|
748
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
749
|
+
previousStatus: response.status,
|
|
750
|
+
timings,
|
|
751
|
+
});
|
|
752
|
+
}
|
|
735
753
|
await sleep(waitMs);
|
|
736
754
|
continue;
|
|
737
755
|
}
|
|
@@ -867,7 +885,21 @@ export class Spider {
|
|
|
867
885
|
};
|
|
868
886
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
887
|
this._resultCount++;
|
|
870
|
-
this.options.onPage
|
|
888
|
+
if (this.options.onPage) {
|
|
889
|
+
let cachedDoc = null;
|
|
890
|
+
await this.options.onPage({
|
|
891
|
+
result: nonHtmlResult,
|
|
892
|
+
html: html || undefined,
|
|
893
|
+
document: html ? () => {
|
|
894
|
+
if (cachedDoc)
|
|
895
|
+
return Promise.resolve(cachedDoc);
|
|
896
|
+
return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
|
|
897
|
+
} : undefined,
|
|
898
|
+
});
|
|
899
|
+
}
|
|
900
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
901
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
902
|
+
}
|
|
871
903
|
return;
|
|
872
904
|
}
|
|
873
905
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -929,9 +961,12 @@ export class Spider {
|
|
|
929
961
|
};
|
|
930
962
|
await this.crawlStorage.saveResult(result);
|
|
931
963
|
this._resultCount++;
|
|
932
|
-
this.options.onPage
|
|
933
|
-
|
|
934
|
-
|
|
964
|
+
if (this.options.onPage) {
|
|
965
|
+
await this.options.onPage({
|
|
966
|
+
result,
|
|
967
|
+
html,
|
|
968
|
+
document: () => Promise.resolve(doc),
|
|
969
|
+
});
|
|
935
970
|
}
|
|
936
971
|
const candidates = [];
|
|
937
972
|
const candidateUrls = [];
|
|
@@ -1026,7 +1061,12 @@ export class Spider {
|
|
|
1026
1061
|
await this.crawlStorage.saveResult(errorResult);
|
|
1027
1062
|
this._resultCount++;
|
|
1028
1063
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
|
-
this.options.onPage
|
|
1064
|
+
if (this.options.onPage) {
|
|
1065
|
+
await this.options.onPage({ result: errorResult });
|
|
1066
|
+
}
|
|
1067
|
+
if (this.options.onError) {
|
|
1068
|
+
await this.options.onError(errorResult);
|
|
1069
|
+
}
|
|
1030
1070
|
}
|
|
1031
1071
|
}
|
|
1032
1072
|
getOrCreateDomainState(hostname) {
|
|
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
+
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
+
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
8
10
|
focusCategories?: string[];
|
|
9
11
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
10
12
|
}
|
|
@@ -32,11 +32,25 @@ export class SeoSpider {
|
|
|
32
32
|
}
|
|
33
33
|
constructor(options = {}) {
|
|
34
34
|
this.options = options;
|
|
35
|
+
const userOnPage = options.onPage;
|
|
35
36
|
this.spider = new Spider({
|
|
36
37
|
...options,
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
await this.analyzePageDuringCrawl(
|
|
38
|
+
onPage: async (event) => {
|
|
39
|
+
if (this.options.seo && event.html) {
|
|
40
|
+
await this.analyzePageDuringCrawl(event.result, event.html);
|
|
41
|
+
}
|
|
42
|
+
if (userOnPage) {
|
|
43
|
+
await userOnPage(event);
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
onBlocked: this.options.onBlocked
|
|
47
|
+
? async (pageResult) => {
|
|
48
|
+
await this.options.onBlocked({ ...pageResult });
|
|
49
|
+
}
|
|
50
|
+
: undefined,
|
|
51
|
+
onError: this.options.onError
|
|
52
|
+
? async (pageResult) => {
|
|
53
|
+
await this.options.onError({ ...pageResult });
|
|
40
54
|
}
|
|
41
55
|
: undefined,
|
|
42
56
|
});
|
|
@@ -102,7 +102,8 @@ export class SpiderRunner extends CommandEmitter {
|
|
|
102
102
|
extract,
|
|
103
103
|
include: include?.map(p => new RegExp(p)),
|
|
104
104
|
exclude: exclude?.map(p => new RegExp(p)),
|
|
105
|
-
onPage: (
|
|
105
|
+
onPage: (event) => {
|
|
106
|
+
const page = event.result;
|
|
106
107
|
collectPageMetrics(page);
|
|
107
108
|
pages.push({
|
|
108
109
|
url: page.url,
|
package/dist/scrape/index.d.ts
CHANGED
|
@@ -3,7 +3,7 @@ export type { Options as ParserOptions } from './parser/index.js';
|
|
|
3
3
|
export { ScrapeDocument } from './document.js';
|
|
4
4
|
export { ScrapeElement } from './element.js';
|
|
5
5
|
export { Spider, spider } from './spider.js';
|
|
6
|
-
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
6
|
+
export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
|
|
7
7
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
8
8
|
export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
9
9
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ScrapeDocument } from './document.js';
|
|
1
2
|
import type { ExtractedLink, ExtractionSchema } from './types.js';
|
|
2
3
|
import type { Options as ParserOptions } from './parser/index.js';
|
|
3
4
|
import { type SitemapUrl } from '../seo/validators/sitemap.js';
|
|
@@ -31,8 +32,7 @@ export interface SpiderOptions {
|
|
|
31
32
|
proxy?: string | string[] | ProxyAdapter;
|
|
32
33
|
transport?: SpiderTransport;
|
|
33
34
|
preferCurlFirst?: boolean;
|
|
34
|
-
onPage?: (
|
|
35
|
-
onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
|
|
35
|
+
onPage?: (event: SpiderPageEvent) => void | Promise<void>;
|
|
36
36
|
onCaptchaDetected?: (result: {
|
|
37
37
|
url: string;
|
|
38
38
|
status: number;
|
|
@@ -40,6 +40,23 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
|
@@ -103,6 +120,11 @@ export interface SpiderPageResult {
|
|
|
103
120
|
};
|
|
104
121
|
extracted?: Record<string, unknown>;
|
|
105
122
|
}
|
|
123
|
+
export interface SpiderPageEvent {
|
|
124
|
+
result: SpiderPageResult;
|
|
125
|
+
html?: string;
|
|
126
|
+
document?: () => Promise<ScrapeDocument>;
|
|
127
|
+
}
|
|
106
128
|
export interface SpiderProgress {
|
|
107
129
|
crawled: number;
|
|
108
130
|
queued: number;
|
package/dist/scrape/spider.js
CHANGED
|
@@ -239,8 +239,11 @@ export class Spider {
|
|
|
239
239
|
exclude: options.exclude,
|
|
240
240
|
include: options.include,
|
|
241
241
|
onPage: options.onPage,
|
|
242
|
-
onPageWithHtml: options.onPageWithHtml,
|
|
243
242
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
243
|
+
onBlocked: options.onBlocked,
|
|
244
|
+
onError: options.onError,
|
|
245
|
+
onRetry: options.onRetry,
|
|
246
|
+
onRedirect: options.onRedirect,
|
|
244
247
|
onProgress: options.onProgress,
|
|
245
248
|
extract: extractSchema,
|
|
246
249
|
parserOptions: options.parserOptions,
|
|
@@ -611,6 +614,9 @@ export class Spider {
|
|
|
611
614
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
615
|
const response = await clientForRequest.get(url, {
|
|
613
616
|
headers: this.buildRequestHeaders(url, false),
|
|
617
|
+
beforeRedirect: this.options.onRedirect
|
|
618
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
619
|
+
: undefined,
|
|
614
620
|
});
|
|
615
621
|
const contentType = response.headers.get('content-type') || '';
|
|
616
622
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -732,6 +738,18 @@ export class Spider {
|
|
|
732
738
|
forcedTransport = 'curl';
|
|
733
739
|
}
|
|
734
740
|
}
|
|
741
|
+
if (this.options.onRetry) {
|
|
742
|
+
await this.options.onRetry({
|
|
743
|
+
url,
|
|
744
|
+
attempt: attempt + 1,
|
|
745
|
+
maxAttempts,
|
|
746
|
+
reason: attemptReason,
|
|
747
|
+
delay: waitMs,
|
|
748
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
749
|
+
previousStatus: response.status,
|
|
750
|
+
timings,
|
|
751
|
+
});
|
|
752
|
+
}
|
|
735
753
|
await sleep(waitMs);
|
|
736
754
|
continue;
|
|
737
755
|
}
|
|
@@ -867,7 +885,21 @@ export class Spider {
|
|
|
867
885
|
};
|
|
868
886
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
887
|
this._resultCount++;
|
|
870
|
-
this.options.onPage
|
|
888
|
+
if (this.options.onPage) {
|
|
889
|
+
let cachedDoc = null;
|
|
890
|
+
await this.options.onPage({
|
|
891
|
+
result: nonHtmlResult,
|
|
892
|
+
html: html || undefined,
|
|
893
|
+
document: html ? () => {
|
|
894
|
+
if (cachedDoc)
|
|
895
|
+
return Promise.resolve(cachedDoc);
|
|
896
|
+
return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
|
|
897
|
+
} : undefined,
|
|
898
|
+
});
|
|
899
|
+
}
|
|
900
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
901
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
902
|
+
}
|
|
871
903
|
return;
|
|
872
904
|
}
|
|
873
905
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -929,9 +961,12 @@ export class Spider {
|
|
|
929
961
|
};
|
|
930
962
|
await this.crawlStorage.saveResult(result);
|
|
931
963
|
this._resultCount++;
|
|
932
|
-
this.options.onPage
|
|
933
|
-
|
|
934
|
-
|
|
964
|
+
if (this.options.onPage) {
|
|
965
|
+
await this.options.onPage({
|
|
966
|
+
result,
|
|
967
|
+
html,
|
|
968
|
+
document: () => Promise.resolve(doc),
|
|
969
|
+
});
|
|
935
970
|
}
|
|
936
971
|
const candidates = [];
|
|
937
972
|
const candidateUrls = [];
|
|
@@ -1026,7 +1061,12 @@ export class Spider {
|
|
|
1026
1061
|
await this.crawlStorage.saveResult(errorResult);
|
|
1027
1062
|
this._resultCount++;
|
|
1028
1063
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
|
-
this.options.onPage
|
|
1064
|
+
if (this.options.onPage) {
|
|
1065
|
+
await this.options.onPage({ result: errorResult });
|
|
1066
|
+
}
|
|
1067
|
+
if (this.options.onError) {
|
|
1068
|
+
await this.options.onError(errorResult);
|
|
1069
|
+
}
|
|
1030
1070
|
}
|
|
1031
1071
|
}
|
|
1032
1072
|
getOrCreateDomainState(hostname) {
|
package/dist/seo/seo-spider.d.ts
CHANGED
|
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
+
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
+
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
8
10
|
focusCategories?: string[];
|
|
9
11
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
10
12
|
}
|
package/dist/seo/seo-spider.js
CHANGED
|
@@ -32,11 +32,25 @@ export class SeoSpider {
|
|
|
32
32
|
}
|
|
33
33
|
constructor(options = {}) {
|
|
34
34
|
this.options = options;
|
|
35
|
+
const userOnPage = options.onPage;
|
|
35
36
|
this.spider = new Spider({
|
|
36
37
|
...options,
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
await this.analyzePageDuringCrawl(
|
|
38
|
+
onPage: async (event) => {
|
|
39
|
+
if (this.options.seo && event.html) {
|
|
40
|
+
await this.analyzePageDuringCrawl(event.result, event.html);
|
|
41
|
+
}
|
|
42
|
+
if (userOnPage) {
|
|
43
|
+
await userOnPage(event);
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
onBlocked: this.options.onBlocked
|
|
47
|
+
? async (pageResult) => {
|
|
48
|
+
await this.options.onBlocked({ ...pageResult });
|
|
49
|
+
}
|
|
50
|
+
: undefined,
|
|
51
|
+
onError: this.options.onError
|
|
52
|
+
? async (pageResult) => {
|
|
53
|
+
await this.options.onError({ ...pageResult });
|
|
40
54
|
}
|
|
41
55
|
: undefined,
|
|
42
56
|
});
|
package/dist/version.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recker",
|
|
3
|
-
"version": "1.0.93-next.40856cc",
|
|
3
|
+
"version": "1.0.93-next.554b0c4",
|
|
4
4
|
"description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|