recker 1.0.102 → 1.0.103-next.8501b56
- package/dist/browser/core/errors.d.ts +2 -2
- package/dist/browser/scrape/crawl-queue.d.ts +2 -0
- package/dist/browser/scrape/domain-stats.d.ts +25 -0
- package/dist/browser/scrape/domain-stats.js +47 -0
- package/dist/browser/scrape/errors.d.ts +82 -0
- package/dist/browser/scrape/errors.js +138 -0
- package/dist/browser/scrape/index.d.ts +5 -0
- package/dist/browser/scrape/index.js +3 -0
- package/dist/browser/scrape/rewrite-url.d.ts +6 -0
- package/dist/browser/scrape/rewrite-url.js +77 -0
- package/dist/browser/scrape/spider.d.ts +13 -0
- package/dist/browser/scrape/spider.js +260 -40
- package/dist/browser/scrape/sqlite-crawl-storage.d.ts +1 -1
- package/dist/clients/index.d.ts +1 -1
- package/dist/clients/index.js +1 -1
- package/dist/clients/reddb-proto.d.ts +1 -0
- package/dist/clients/reddb-proto.js +195 -0
- package/dist/clients/reddb.d.ts +306 -71
- package/dist/clients/reddb.js +1410 -324
- package/dist/core/errors.d.ts +2 -2
- package/dist/protocols/index.d.ts +1 -1
- package/dist/protocols/index.js +1 -1
- package/dist/scrape/crawl-queue.d.ts +2 -0
- package/dist/scrape/domain-stats.d.ts +25 -0
- package/dist/scrape/domain-stats.js +47 -0
- package/dist/scrape/errors.d.ts +82 -0
- package/dist/scrape/errors.js +138 -0
- package/dist/scrape/index.d.ts +5 -0
- package/dist/scrape/index.js +3 -0
- package/dist/scrape/rewrite-url.d.ts +6 -0
- package/dist/scrape/rewrite-url.js +77 -0
- package/dist/scrape/spider.d.ts +13 -0
- package/dist/scrape/spider.js +260 -40
- package/dist/scrape/sqlite-crawl-storage.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +4 -1

@@ -1,6 +1,6 @@
 import { ReckerRequest, ReckerResponse } from '../types/index.js';
-export type CanonicalErrorCategory = 'http' | 'network' | 'timeout' | 'protocol' | 'validation' | 'state' | 'filesystem' | 'resource' | 'policy' | 'queue' | 'unknown';
-export type ErrorSource = 'client' | 'transport' | 'server' | 'upstream';
+export type CanonicalErrorCategory = 'http' | 'network' | 'timeout' | 'protocol' | 'validation' | 'state' | 'filesystem' | 'resource' | 'policy' | 'queue' | 'scrape' | 'unknown';
+export type ErrorSource = 'client' | 'transport' | 'server' | 'upstream' | 'spider';
 export type CanonicalErrorSeverity = 'low' | 'medium' | 'high';
 export interface CanonicalErrorMetadata {
     category: CanonicalErrorCategory;
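The widened unions are the only change to the core error types: the new 'scrape' category and 'spider' source let the spider errors added below carry canonical metadata. A minimal sketch of what now type-checks (the top-level import specifier is an assumption):

import type { CanonicalErrorCategory, ErrorSource } from 'recker'; // specifier assumed

const category: CanonicalErrorCategory = 'scrape'; // newly valid in 1.0.103
const source: ErrorSource = 'spider';              // newly valid in 1.0.103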

@@ -0,0 +1,25 @@
+export interface DomainTransportStats {
+    hostname: string;
+    curlSuccess: number;
+    curlFail: number;
+    undiciSuccess: number;
+    undiciFail: number;
+    lastSuccessTransport?: 'curl' | 'undici';
+    preferredTransport?: 'curl' | 'undici';
+    lastSeenTs: number;
+}
+export interface DomainStatsAdapter {
+    load(hostname: string): Promise<DomainTransportStats | null>;
+    loadAll(): Promise<Map<string, DomainTransportStats>>;
+    record(hostname: string, transport: 'curl' | 'undici', success: boolean): Promise<void>;
+    clear?(): Promise<void>;
+    close?(): Promise<void>;
+}
+export declare class InMemoryDomainStats implements DomainStatsAdapter {
+    private map;
+    load(hostname: string): Promise<DomainTransportStats | null>;
+    loadAll(): Promise<Map<string, DomainTransportStats>>;
+    record(hostname: string, transport: 'curl' | 'undici', success: boolean): Promise<void>;
+    clear(): Promise<void>;
+    close(): Promise<void>;
+}

@@ -0,0 +1,47 @@
+export class InMemoryDomainStats {
+    map = new Map();
+    async load(hostname) {
+        const s = this.map.get(hostname);
+        return s ? { ...s } : null;
+    }
+    async loadAll() {
+        const copy = new Map();
+        for (const [k, v] of this.map) {
+            copy.set(k, { ...v });
+        }
+        return copy;
+    }
+    async record(hostname, transport, success) {
+        let stats = this.map.get(hostname);
+        if (!stats) {
+            stats = {
+                hostname,
+                curlSuccess: 0,
+                curlFail: 0,
+                undiciSuccess: 0,
+                undiciFail: 0,
+                lastSeenTs: 0,
+            };
+            this.map.set(hostname, stats);
+        }
+        if (success) {
+            if (transport === 'curl')
+                stats.curlSuccess += 1;
+            else
+                stats.undiciSuccess += 1;
+            stats.lastSuccessTransport = transport;
+        }
+        else {
+            if (transport === 'curl')
+                stats.curlFail += 1;
+            else
+                stats.undiciFail += 1;
+        }
+        stats.lastSeenTs = Date.now();
+    }
+    async clear() {
+        this.map.clear();
+    }
+    async close() {
+    }
+}
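The new DomainStatsAdapter contract and its InMemoryDomainStats implementation record per-host success/failure counts for the curl and undici transports, so the spider can learn which transport a host prefers. A minimal usage sketch (the import path is an assumption; the published subpath may differ):

import { InMemoryDomainStats } from 'recker/dist/scrape/domain-stats.js'; // path assumed

const stats = new InMemoryDomainStats();
await stats.record('example.com', 'curl', true);    // one curl success
await stats.record('example.com', 'undici', false); // one undici failure

const s = await stats.load('example.com');
// s?.curlSuccess === 1, s?.undiciFail === 1, s?.lastSuccessTransport === 'curl'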

@@ -0,0 +1,82 @@
+import { ReckerError } from '../core/errors.js';
+import type { BlockDetectionResult } from '../utils/block-detector.js';
+import type { CaptchaDetectionResult } from '../utils/block-detector.js';
+export type SpiderTransportLabel = 'auto' | 'undici' | 'curl';
+export type CaptchaProviderLabel = CaptchaDetectionResult['provider'];
+export declare class SpiderBlockError extends ReckerError {
+    url: string;
+    domain: string;
+    reason: BlockDetectionResult['reason'];
+    confidence: number;
+    transport: SpiderTransportLabel;
+    preferredTransport?: SpiderTransportLabel;
+    constructor(opts: {
+        url: string;
+        domain: string;
+        reason: BlockDetectionResult['reason'];
+        confidence: number;
+        transport: SpiderTransportLabel;
+        preferredTransport?: SpiderTransportLabel;
+        statusCode?: number;
+        message?: string;
+    });
+}
+export declare class SpiderChallengeError extends ReckerError {
+    url: string;
+    domain: string;
+    provider?: CaptchaProviderLabel;
+    confidence: number;
+    cooldownMs: number;
+    transport: SpiderTransportLabel;
+    constructor(opts: {
+        url: string;
+        domain: string;
+        provider?: CaptchaProviderLabel;
+        confidence: number;
+        cooldownMs: number;
+        transport: SpiderTransportLabel;
+        statusCode?: number;
+    });
+}
+export declare class SpiderRobotsDisallowedError extends ReckerError {
+    url: string;
+    path: string;
+    userAgent: string;
+    robotsUrl: string;
+    constructor(opts: {
+        url: string;
+        path: string;
+        userAgent: string;
+        robotsUrl: string;
+    });
+}
+export declare class SpiderDepthLimitError extends ReckerError {
+    url: string;
+    depth: number;
+    maxDepth: number;
+    constructor(opts: {
+        url: string;
+        depth: number;
+        maxDepth: number;
+    });
+}
+export declare class SpiderDomainOutOfScopeError extends ReckerError {
+    url: string;
+    allowedDomains?: string[];
+    constructor(opts: {
+        url: string;
+        allowedDomains?: string[];
+    });
+}
+export declare class SpiderUnsupportedContentError extends ReckerError {
+    url: string;
+    contentType: string;
+    reason: 'binary' | 'pdf' | 'doc' | 'media' | 'archive';
+    fallbackSuggestion?: string;
+    constructor(opts: {
+        url: string;
+        contentType: string;
+        reason: 'binary' | 'pdf' | 'doc' | 'media' | 'archive';
+        fallbackSuggestion?: string;
+    });
+}

@@ -0,0 +1,138 @@
+import { ReckerError } from '../core/errors.js';
+export class SpiderBlockError extends ReckerError {
+    url;
+    domain;
+    reason;
+    confidence;
+    transport;
+    preferredTransport;
+    constructor(opts) {
+        super(opts.message ?? `Request to ${opts.url} was blocked (${opts.reason ?? 'unknown'})`, undefined, undefined, [
+            opts.preferredTransport && opts.preferredTransport !== opts.transport
+                ? `Retry with the ${opts.preferredTransport} transport.`
+                : 'Rotate user-agent, proxy, or use curl-impersonate to bypass TLS fingerprinting.',
+            'Increase the retry delay and honor any Retry-After header.',
+        ], true, {
+            category: 'scrape',
+            source: 'spider',
+            severity: 'medium',
+            canRetry: true,
+            reason: `Blocked: ${opts.reason ?? 'unknown'}`,
+            statusCode: opts.statusCode,
+        });
+        this.name = 'SpiderBlockError';
+        this.url = opts.url;
+        this.domain = opts.domain;
+        this.reason = opts.reason;
+        this.confidence = opts.confidence;
+        this.transport = opts.transport;
+        this.preferredTransport = opts.preferredTransport;
+    }
+}
+export class SpiderChallengeError extends ReckerError {
+    url;
+    domain;
+    provider;
+    confidence;
+    cooldownMs;
+    transport;
+    constructor(opts) {
+        super(`CAPTCHA challenge detected on ${opts.url}${opts.provider ? ` (${opts.provider})` : ''}`, undefined, undefined, [
+            `Cool down the host for at least ${opts.cooldownMs}ms before retrying.`,
+            'Consider solving the challenge via an external CAPTCHA-solving service.',
+            'Rotate proxy/IP before retrying.',
+        ], true, {
+            category: 'scrape',
+            source: 'spider',
+            severity: 'high',
+            canRetry: true,
+            reason: opts.provider ? `Challenge: ${opts.provider}` : 'CAPTCHA challenge',
+            statusCode: opts.statusCode,
+            retryAfterMs: opts.cooldownMs,
+        });
+        this.name = 'SpiderChallengeError';
+        this.url = opts.url;
+        this.domain = opts.domain;
+        this.provider = opts.provider;
+        this.confidence = opts.confidence;
+        this.cooldownMs = opts.cooldownMs;
+        this.transport = opts.transport;
+    }
+}
+export class SpiderRobotsDisallowedError extends ReckerError {
+    url;
+    path;
+    userAgent;
+    robotsUrl;
+    constructor(opts) {
+        super(`URL ${opts.url} is disallowed by robots.txt for ${opts.userAgent}`, undefined, undefined, [
+            'Do not crawl this URL — the site operator has explicitly excluded it.',
+            `Check ${opts.robotsUrl} for the full rules.`,
+        ], false, {
+            category: 'scrape',
+            source: 'spider',
+            severity: 'low',
+            canRetry: false,
+            reason: 'Disallowed by robots.txt',
+        });
+        this.name = 'SpiderRobotsDisallowedError';
+        this.url = opts.url;
+        this.path = opts.path;
+        this.userAgent = opts.userAgent;
+        this.robotsUrl = opts.robotsUrl;
+    }
+}
+export class SpiderDepthLimitError extends ReckerError {
+    url;
+    depth;
+    maxDepth;
+    constructor(opts) {
+        super(`URL ${opts.url} exceeds max crawl depth (${opts.depth} > ${opts.maxDepth})`, undefined, undefined, ['Increase maxDepth if deeper crawls are expected.'], false, {
+            category: 'scrape',
+            source: 'spider',
+            severity: 'low',
+            canRetry: false,
+            reason: 'Depth limit exceeded',
+        });
+        this.name = 'SpiderDepthLimitError';
+        this.url = opts.url;
+        this.depth = opts.depth;
+        this.maxDepth = opts.maxDepth;
+    }
+}
+export class SpiderDomainOutOfScopeError extends ReckerError {
+    url;
+    allowedDomains;
+    constructor(opts) {
+        super(`URL ${opts.url} is outside the crawl scope`, undefined, undefined, ['Add the host to allowedDomains or disable sameDomain filtering.'], false, {
+            category: 'scrape',
+            source: 'spider',
+            severity: 'low',
+            canRetry: false,
+            reason: 'Domain out of scope',
+        });
+        this.name = 'SpiderDomainOutOfScopeError';
+        this.url = opts.url;
+        this.allowedDomains = opts.allowedDomains;
+    }
+}
+export class SpiderUnsupportedContentError extends ReckerError {
+    url;
+    contentType;
+    reason;
+    fallbackSuggestion;
+    constructor(opts) {
+        super(`Unsupported content type on ${opts.url}: ${opts.contentType}`, undefined, undefined, opts.fallbackSuggestion ? [opts.fallbackSuggestion] : ['Handle this URL with a specialty fetcher or skip.'], false, {
+            category: 'scrape',
+            source: 'spider',
+            severity: 'low',
+            canRetry: false,
+            reason: `Unsupported content: ${opts.contentType}`,
+        });
+        this.name = 'SpiderUnsupportedContentError';
+        this.url = opts.url;
+        this.contentType = opts.contentType;
+        this.reason = opts.reason;
+        this.fallbackSuggestion = opts.fallbackSuggestion;
+    }
+}
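Each class above routes through the ReckerError super constructor with a message, a suggestion list, a retryability flag, and canonical metadata (category 'scrape', source 'spider'), then copies its options onto public fields. A consumer-side narrowing sketch via instanceof (the import path is an assumption):

import { SpiderBlockError, SpiderChallengeError } from 'recker/dist/scrape/errors.js'; // path assumed

function describe(err: unknown): string {
    if (err instanceof SpiderChallengeError) {
        return `challenge on ${err.domain}, cool down ${err.cooldownMs}ms`;
    }
    if (err instanceof SpiderBlockError) {
        return `blocked on ${err.domain} via ${err.transport} (${err.reason ?? 'unknown'})`;
    }
    return 'not a spider error';
}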

@@ -4,12 +4,17 @@ export { ScrapeDocument } from './document.js';
 export { ScrapeElement } from './element.js';
 export { Spider, spider } from './spider.js';
 export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
+export { SpiderBlockError, SpiderChallengeError, SpiderRobotsDisallowedError, SpiderDepthLimitError, SpiderDomainOutOfScopeError, SpiderUnsupportedContentError, } from './errors.js';
+export { rewriteUrl } from './rewrite-url.js';
+export type { UrlRewriteResult } from './rewrite-url.js';
 export { InMemoryCrawlQueue } from './crawl-queue.js';
 export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
 export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
 export { InMemoryCrawlStorage } from './crawl-storage.js';
 export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
 export type { CrawlStorageAdapter } from './crawl-storage.js';
+export { InMemoryDomainStats } from './domain-stats.js';
+export type { DomainStatsAdapter, DomainTransportStats } from './domain-stats.js';
 export { ListProxyAdapter } from './proxy-adapter.js';
 export type { ProxyAdapter } from './proxy-adapter.js';
 export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';

@@ -2,9 +2,12 @@ export { parse as parseHtmlSync, HTMLElement, TextNode, CommentNode, Node, NodeT
 export { ScrapeDocument } from './document.js';
 export { ScrapeElement } from './element.js';
 export { Spider, spider } from './spider.js';
+export { SpiderBlockError, SpiderChallengeError, SpiderRobotsDisallowedError, SpiderDepthLimitError, SpiderDomainOutOfScopeError, SpiderUnsupportedContentError, } from './errors.js';
+export { rewriteUrl } from './rewrite-url.js';
 export { InMemoryCrawlQueue } from './crawl-queue.js';
 export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
 export { InMemoryCrawlStorage } from './crawl-storage.js';
 export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
+export { InMemoryDomainStats } from './domain-stats.js';
 export { ListProxyAdapter } from './proxy-adapter.js';
 export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';

@@ -0,0 +1,77 @@
+const GOOGLE_DOC_PREFIXES = [
+    'https://docs.google.com/document/d/',
+    'http://docs.google.com/document/d/',
+];
+const GOOGLE_PRESENTATION_PREFIXES = [
+    'https://docs.google.com/presentation/d/',
+    'http://docs.google.com/presentation/d/',
+];
+const GOOGLE_SPREADSHEET_PREFIXES = [
+    'https://docs.google.com/spreadsheets/d/',
+    'http://docs.google.com/spreadsheets/d/',
+];
+const GOOGLE_DRIVE_FILE_PREFIXES = [
+    'https://drive.google.com/file/d/',
+    'http://drive.google.com/file/d/',
+];
+function startsWithAny(url, prefixes) {
+    for (const p of prefixes) {
+        if (url.startsWith(p))
+            return true;
+    }
+    return false;
+}
+export function rewriteUrl(input) {
+    if (startsWithAny(input, GOOGLE_DOC_PREFIXES)) {
+        if (input.includes('/document/d/e/')) {
+            return { url: input, rewritten: false };
+        }
+        const id = input.match(/\/document\/d\/([-\w]+)/)?.[1];
+        if (id) {
+            return {
+                url: `https://docs.google.com/document/d/${id}/export?format=html`,
+                rewritten: true,
+                reason: 'google-docs-export',
+            };
+        }
+    }
+    if (startsWithAny(input, GOOGLE_PRESENTATION_PREFIXES)) {
+        if (input.includes('/presentation/d/e/')) {
+            return { url: input, rewritten: false };
+        }
+        const id = input.match(/\/presentation\/d\/([-\w]+)/)?.[1];
+        if (id) {
+            return {
+                url: `https://docs.google.com/presentation/d/${id}/export?format=html`,
+                rewritten: true,
+                reason: 'google-slides-export',
+            };
+        }
+    }
+    if (startsWithAny(input, GOOGLE_SPREADSHEET_PREFIXES)) {
+        if (input.includes('/spreadsheets/d/e/')) {
+            return { url: input, rewritten: false };
+        }
+        const id = input.match(/\/spreadsheets\/d\/([-\w]+)/)?.[1];
+        if (id) {
+            const gidMatch = input.match(/[?&#]gid=(\d+)/);
+            const gidParam = gidMatch ? `&gid=${gidMatch[1]}` : '';
+            return {
+                url: `https://docs.google.com/spreadsheets/d/${id}/gviz/tq?tqx=out:html${gidParam}`,
+                rewritten: true,
+                reason: 'google-sheets-export',
+            };
+        }
+    }
+    if (startsWithAny(input, GOOGLE_DRIVE_FILE_PREFIXES)) {
+        const id = input.match(/\/file\/d\/([-\w]+)/)?.[1];
+        if (id) {
+            return {
+                url: `https://drive.google.com/uc?export=download&id=${id}`,
+                rewritten: true,
+                reason: 'google-drive-download',
+            };
+        }
+    }
+    return { url: input, rewritten: false };
+}
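rewriteUrl maps shared Google Docs/Slides/Sheets/Drive links to directly fetchable export URLs and leaves everything else (including published /d/e/ links) untouched. For example (the import path is an assumption):

import { rewriteUrl } from 'recker/dist/scrape/rewrite-url.js'; // path assumed

rewriteUrl('https://docs.google.com/document/d/abc123/edit');
// → { url: 'https://docs.google.com/document/d/abc123/export?format=html',
//     rewritten: true, reason: 'google-docs-export' }

rewriteUrl('https://example.com/page');
// → { url: 'https://example.com/page', rewritten: false }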

@@ -5,7 +5,9 @@ import { type SitemapUrl } from '../seo/validators/sitemap.js';
 import { type CaptchaDetectionResult, type BlockDetectionResult } from '../utils/block-detector.js';
 import { type CrawlQueueAdapter } from './crawl-queue.js';
 import { type CrawlStorageAdapter } from './crawl-storage.js';
+import type { DomainStatsAdapter } from './domain-stats.js';
 import { type ProxyAdapter } from './proxy-adapter.js';
+import type { ReckerError } from '../core/errors.js';
 export type SpiderTransport = 'auto' | 'undici' | 'curl';
 type CaptchaProvider = CaptchaDetectionResult['provider'];
 export interface SpiderOptions {

@@ -28,11 +30,15 @@ export interface SpiderOptions {
     retryJitterMs?: number;
     maxDomainBlockStrikes?: number;
     respectRobotsTxt?: boolean;
+    respectRobotsCrawlDelay?: boolean;
+    maxRobotsCrawlDelayMs?: number;
     useSitemap?: boolean;
     sitemapUrl?: string;
+    rewriteUrls?: boolean;
     proxy?: string | string[] | ProxyAdapter;
     transport?: SpiderTransport;
     preferCurlFirst?: boolean;
+    domainStats?: DomainStatsAdapter;
     onPage?: (event: SpiderPageEvent) => void | Promise<void>;
     onCaptchaDetected?: (result: {
         url: string;
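The new options slot into SpiderOptions alongside the existing robots/sitemap flags. A configuration sketch combining them (import paths are assumptions; all fields shown are optional per the declarations above):

import { Spider } from 'recker/dist/scrape/spider.js';                    // path assumed
import { InMemoryDomainStats } from 'recker/dist/scrape/domain-stats.js'; // path assumed

const crawler = new Spider({
    respectRobotsTxt: true,
    respectRobotsCrawlDelay: true,          // honor Crawl-delay from robots.txt (new)
    maxRobotsCrawlDelayMs: 30_000,          // cap an excessive Crawl-delay directive (new)
    rewriteUrls: true,                      // apply the rewriteUrl() mapping above (new)
    domainStats: new InMemoryDomainStats(), // record per-domain transport outcomes (new)
});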

@@ -134,6 +140,9 @@ export interface SpiderPageResult {
     contentHash?: string;
     isDuplicate?: boolean;
     duplicateOf?: string;
+    sourceUrl?: string;
+    rewriteReason?: string;
+    typedError?: ReckerError;
 }
 export interface SpiderPageEvent {
     result: SpiderPageResult;
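The three new SpiderPageResult fields surface URL-rewrite provenance and a typed error object to the onPage callback. A handler sketch (import paths assumed):

import { Spider } from 'recker/dist/scrape/spider.js';                // path assumed
import { SpiderChallengeError } from 'recker/dist/scrape/errors.js';  // path assumed

const crawler = new Spider({
    onPage: ({ result }) => {
        if (result.rewriteReason) {
            console.info(`rewritten from ${result.sourceUrl} (${result.rewriteReason})`);
        }
        if (result.typedError instanceof SpiderChallengeError) {
            console.warn(`CAPTCHA challenge: cool down ${result.typedError.cooldownMs}ms`);
        }
    },
});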

@@ -191,6 +200,7 @@ export declare class Spider {
     private pool;
     private crawlQueue;
     private crawlStorage;
+    private domainStats;
     private proxyAdapter;
     private proxyClients;
     private _visitedCount;

@@ -209,11 +219,13 @@ export declare class Spider {
     private curlTransport;
     private curlAvailable;
     private domainStates;
+    private domainStatsWrites;
     private sitemapUrls;
     private sitemapUrlSet;
     private robotsData;
     private sitemapValidation;
     private robotsValidation;
+    private baseRobotsCrawlDelayMs;
     private waitForDomainRateLimit;
     private toHeaderRecord;
     constructor(options?: SpiderOptions);

@@ -227,6 +239,7 @@ export declare class Spider {
     private crawlPage;
     private getOrCreateDomainState;
     private recordTransportResult;
+    private persistTransportStat;
     private isRetryableStatus;
     private buildRequestHeaders;
     private shouldUseCurlForHost;