recker 1.0.86 → 1.0.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/core/client.d.ts +2 -0
- package/dist/browser/core/client.js +8 -0
- package/dist/browser/core/request.d.ts +3 -0
- package/dist/browser/core/request.js +6 -2
- package/dist/browser/index.iife.min.js +56 -56
- package/dist/browser/index.min.js +56 -56
- package/dist/browser/index.mini.iife.js +312 -15
- package/dist/browser/index.mini.iife.min.js +38 -38
- package/dist/browser/index.mini.min.js +42 -42
- package/dist/browser/index.mini.umd.js +312 -15
- package/dist/browser/index.mini.umd.min.js +38 -38
- package/dist/browser/index.umd.min.js +56 -56
- package/dist/browser/plugins/queue.d.ts +41 -0
- package/dist/browser/plugins/queue.js +184 -0
- package/dist/browser/scrape/crawl-queue.d.ts +31 -0
- package/dist/browser/scrape/crawl-queue.js +40 -0
- package/dist/browser/scrape/crawl-storage.d.ts +33 -0
- package/dist/browser/scrape/crawl-storage.js +26 -0
- package/dist/browser/scrape/index.d.ts +6 -0
- package/dist/browser/scrape/index.js +3 -0
- package/dist/browser/scrape/proxy-adapter.d.ts +12 -0
- package/dist/browser/scrape/proxy-adapter.js +17 -0
- package/dist/browser/scrape/spider.d.ts +14 -4
- package/dist/browser/scrape/spider.js +119 -45
- package/dist/browser/transport/curl.js +53 -9
- package/dist/browser/transport/undici.js +4 -0
- package/dist/browser/types/index.d.ts +53 -2
- package/dist/core/client.d.ts +2 -0
- package/dist/core/client.js +8 -0
- package/dist/core/request.d.ts +3 -0
- package/dist/core/request.js +6 -2
- package/dist/index.d.ts +2 -0
- package/dist/index.js +2 -0
- package/dist/plugins/queue.d.ts +41 -0
- package/dist/plugins/queue.js +184 -0
- package/dist/queue/consumer.d.ts +17 -0
- package/dist/queue/consumer.js +48 -0
- package/dist/scrape/crawl-queue.d.ts +31 -0
- package/dist/scrape/crawl-queue.js +40 -0
- package/dist/scrape/crawl-storage.d.ts +33 -0
- package/dist/scrape/crawl-storage.js +26 -0
- package/dist/scrape/index.d.ts +6 -0
- package/dist/scrape/index.js +3 -0
- package/dist/scrape/proxy-adapter.d.ts +12 -0
- package/dist/scrape/proxy-adapter.js +17 -0
- package/dist/scrape/spider.d.ts +14 -4
- package/dist/scrape/spider.js +119 -45
- package/dist/transport/curl.js +53 -9
- package/dist/transport/undici.js +4 -0
- package/dist/types/index.d.ts +53 -2
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { Plugin, ReckerResponse, QueueAdapter, QueueJob, QueueOptions } from '../types/index.js';
|
|
2
|
+
declare class QueuedResponse<T = unknown> implements ReckerResponse<T> {
|
|
3
|
+
readonly jobId: string;
|
|
4
|
+
readonly raw: Response;
|
|
5
|
+
readonly timings: undefined;
|
|
6
|
+
readonly connection: undefined;
|
|
7
|
+
constructor(jobId: string);
|
|
8
|
+
get status(): number;
|
|
9
|
+
get statusText(): string;
|
|
10
|
+
get headers(): Headers;
|
|
11
|
+
get ok(): boolean;
|
|
12
|
+
get url(): string;
|
|
13
|
+
json<R = T>(): Promise<R>;
|
|
14
|
+
text(): Promise<string>;
|
|
15
|
+
cleanText(): Promise<string>;
|
|
16
|
+
blob(): Promise<Blob>;
|
|
17
|
+
read(): ReadableStream<Uint8Array<ArrayBuffer>> | null;
|
|
18
|
+
clone(): ReckerResponse<T>;
|
|
19
|
+
sse(): AsyncGenerator<never, void, unknown>;
|
|
20
|
+
download(): AsyncGenerator<never, void, unknown>;
|
|
21
|
+
[Symbol.asyncIterator](): AsyncGenerator<Uint8Array<ArrayBuffer>, void, unknown>;
|
|
22
|
+
}
|
|
23
|
+
export declare function queuePlugin(options: QueueOptions): Plugin;
|
|
24
|
+
export declare class InMemoryQueueAdapter implements QueueAdapter {
|
|
25
|
+
private jobs;
|
|
26
|
+
enqueue(job: QueueJob): Promise<string>;
|
|
27
|
+
getStatus(jobId: string): Promise<{
|
|
28
|
+
jobId: string;
|
|
29
|
+
status: "unknown";
|
|
30
|
+
} | {
|
|
31
|
+
jobId: string;
|
|
32
|
+
status: "pending" | "processing" | "completed" | "failed";
|
|
33
|
+
}>;
|
|
34
|
+
getPendingJobs(): QueueJob[];
|
|
35
|
+
setJobStatus(jobId: string, status: string): void;
|
|
36
|
+
getJob(jobId: string): QueueJob | undefined;
|
|
37
|
+
clear(): void;
|
|
38
|
+
get size(): number;
|
|
39
|
+
close(): Promise<void>;
|
|
40
|
+
}
|
|
41
|
+
export { QueuedResponse };
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
// Monotonic per-process counter; combined with the timestamp and a random
// suffix below it makes id collisions effectively impossible in one process.
let jobSeq = 0;
/**
 * Produce a unique job id of the form `qj-<time base36>-<seq base36>-<hex rand>`.
 */
function generateJobId() {
    jobSeq += 1;
    const time = Date.now().toString(36);
    const seq = jobSeq.toString(36);
    const rand = Math.random().toString(16).slice(2, 8);
    return ['qj', time, seq, rand].join('-');
}
|
|
6
|
+
/**
 * Flatten an outgoing request into a plain-object queue job that can be
 * persisted by any adapter. Header entries are copied into a string map;
 * a non-string body is JSON-encoded when possible (falling back to
 * String()); empty metadata collapses to undefined to keep jobs compact.
 */
function serializeRequest(req, jobId, metadata) {
    const headers = {};
    req.headers.forEach((value, name) => {
        headers[name] = value;
    });
    // Header names from Headers iteration are lowercased per the Fetch spec,
    // so a lowercase lookup is reliable here.
    const bodyContentType = headers['content-type'];
    const rawBody = req.body;
    let body = null;
    if (rawBody !== null && rawBody !== undefined) {
        if (typeof rawBody === 'string') {
            body = rawBody;
        }
        else if (typeof rawBody === 'object') {
            // Circular structures make JSON.stringify throw; degrade to String().
            try {
                body = JSON.stringify(rawBody);
            }
            catch {
                body = String(rawBody);
            }
        }
        else {
            body = String(rawBody);
        }
    }
    const hasMetadata = metadata && Object.keys(metadata).length > 0;
    return {
        jobId,
        url: req.url,
        method: req.method,
        headers,
        body,
        bodyContentType: bodyContentType || undefined,
        correlationId: req.correlationId,
        traceId: req.traceId,
        tenant: req.tenant,
        policyTags: req.policyTags?.length ? req.policyTags : undefined,
        createdAt: Date.now(),
        metadata: hasMetadata ? metadata : undefined,
    };
}
|
|
44
|
+
/**
 * Build a request predicate from a declarative filter config.
 * All configured criteria must pass (AND semantics); criteria that are
 * omitted from the config match everything.
 */
function compileFilter(config) {
    // String patterns are substring matches; RegExp patterns use test().
    const matchesPattern = (pattern, url) => typeof pattern === 'string' ? url.includes(pattern) : pattern.test(url);
    return (req) => {
        if (config.methods && !config.methods.includes(req.method)) {
            return false;
        }
        if (config.urlPatterns &&
            !config.urlPatterns.some((pattern) => matchesPattern(pattern, req.url))) {
            return false;
        }
        if (config.headerPresent && !req.headers.has(config.headerPresent)) {
            return false;
        }
        return true;
    };
}
|
|
64
|
+
/**
 * Synthetic HTTP 202 "Accepted" response handed back in place of a real
 * network response when a request is diverted onto a queue. The body is a
 * small JSON envelope carrying the assigned job id.
 */
class QueuedResponse {
    jobId;
    raw;
    timings = undefined;
    connection = undefined;
    constructor(jobId) {
        this.jobId = jobId;
        const envelope = JSON.stringify({ queued: true, jobId, status: 'queued' });
        this.raw = new Response(envelope, {
            status: 202,
            statusText: 'Accepted',
            headers: {
                'Content-Type': 'application/json',
                'X-Queue-Job-Id': jobId,
                'X-Queue-Status': 'queued',
            },
        });
    }
    // The response is always a fixed 202/Accepted, so these accessors are constants.
    get status() { return 202; }
    get statusText() { return 'Accepted'; }
    get headers() { return this.raw.headers; }
    get ok() { return true; }
    get url() { return ''; }
    json() { return this.raw.json(); }
    text() { return this.raw.text(); }
    async cleanText() { return this.raw.text(); }
    blob() { return this.raw.blob(); }
    read() { return this.raw.body; }
    clone() {
        // A fresh instance rebuilds the identical envelope for the same job id.
        return new QueuedResponse(this.jobId);
    }
    // Streaming helpers yield nothing: there is no live server stream here.
    async *sse() { }
    async *download() { }
    async *[Symbol.asyncIterator]() {
        const stream = this.raw.body;
        if (!stream)
            return;
        const reader = stream.getReader();
        for (;;) {
            const chunk = await reader.read();
            if (chunk.done)
                break;
            yield chunk.value;
        }
    }
}
|
|
113
|
+
/**
 * Plugin that intercepts matching requests and enqueues them instead of
 * dispatching them, resolving immediately with a synthetic 202
 * QueuedResponse. Per-request `queue: false` always bypasses the queue;
 * `queue: true` or a per-request queue object forces enqueueing regardless
 * of the configured filter.
 */
export function queuePlugin(options) {
    const adapter = options.adapter;
    let filter;
    if (typeof options.filter === 'function') {
        filter = options.filter;
    }
    else if (options.filter) {
        filter = compileFilter(options.filter);
    }
    else {
        filter = () => true;
    }
    const generateId = options.jobIdGenerator || generateJobId;
    const queueMiddleware = async (req, next) => {
        const perReq = req.queue;
        // Explicit opt-out wins over everything else.
        if (perReq === false) {
            return next(req);
        }
        const forced = perReq === true || typeof perReq === 'object';
        if (!forced && !filter(req)) {
            return next(req);
        }
        // Per-request metadata overrides plugin-level defaults key-by-key.
        const perReqMetadata = typeof perReq === 'object' && perReq !== null && 'metadata' in perReq
            ? perReq.metadata
            : undefined;
        const metadata = { ...options.defaultMetadata, ...perReqMetadata };
        const jobId = generateId(req);
        const assignedId = await adapter.enqueue(serializeRequest(req, jobId, metadata));
        // Adapters may assign their own id; prefer it when one is returned.
        return new QueuedResponse(assignedId || jobId);
    };
    return (client) => {
        client.use(queueMiddleware);
    };
}
|
|
145
|
+
/**
 * Reference QueueAdapter that keeps jobs in a Map, mainly for tests and
 * single-process use. Jobs start as 'pending'; the status is tracked
 * alongside the job internally but stripped from the shapes handed back
 * to consumers via getPendingJobs()/getJob().
 */
export class InMemoryQueueAdapter {
    jobs = new Map();
    async enqueue(job) {
        this.jobs.set(job.jobId, { ...job, status: 'pending' });
        return job.jobId;
    }
    async getStatus(jobId) {
        const entry = this.jobs.get(jobId);
        return entry ? { jobId, status: entry.status } : { jobId, status: 'unknown' };
    }
    getPendingJobs() {
        const pending = [];
        for (const entry of this.jobs.values()) {
            if (entry.status === 'pending') {
                // Drop the internal status field before exposing the job.
                const { status, ...job } = entry;
                pending.push(job);
            }
        }
        return pending;
    }
    setJobStatus(jobId, status) {
        const entry = this.jobs.get(jobId);
        if (entry) {
            entry.status = status;
        }
    }
    getJob(jobId) {
        const entry = this.jobs.get(jobId);
        if (!entry) {
            return undefined;
        }
        const { status, ...job } = entry;
        return job;
    }
    clear() {
        this.jobs.clear();
    }
    get size() {
        return this.jobs.size;
    }
    async close() {
        this.jobs.clear();
    }
}
|
|
184
|
+
export { QueuedResponse };
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { QueueJob, ReckerResponse } from '../types/index.js';
|
|
2
|
+
export interface ExecuteQueuedRequestOptions {
|
|
3
|
+
client?: {
|
|
4
|
+
request(url: string, options?: any): Promise<ReckerResponse>;
|
|
5
|
+
};
|
|
6
|
+
clientOptions?: Record<string, unknown>;
|
|
7
|
+
timeout?: number;
|
|
8
|
+
}
|
|
9
|
+
export interface QueueExecutionResult {
|
|
10
|
+
jobId: string;
|
|
11
|
+
success: boolean;
|
|
12
|
+
response?: ReckerResponse;
|
|
13
|
+
error?: Error;
|
|
14
|
+
duration: number;
|
|
15
|
+
}
|
|
16
|
+
export declare function executeQueuedRequest(job: QueueJob, options?: ExecuteQueuedRequestOptions): Promise<QueueExecutionResult>;
|
|
17
|
+
export declare function isValidQueueJob(value: unknown): value is QueueJob;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
 * Replay a serialized queue job as a real HTTP request.
 * Uses the provided client when given; otherwise lazily imports and builds
 * a default client. Never throws: failures are folded into the result with
 * success=false. `queue: false` on the replayed request prevents it from
 * being re-enqueued by the queue plugin.
 */
export async function executeQueuedRequest(job, options = {}) {
    const startedAt = Date.now();
    let client = options.client;
    if (client == null) {
        const { createClient } = await import('../core/client.js');
        client = createClient(options.clientOptions || {});
    }
    try {
        const requestOptions = {
            method: job.method,
            headers: { ...job.headers },
            body: job.body,
            correlationId: job.correlationId,
            traceId: job.traceId,
            tenant: job.tenant,
            policyTags: job.policyTags,
            throwHttpErrors: false,
            queue: false,
        };
        // NOTE(review): the falsy check means timeout=0 is treated as "unset" — confirm intended.
        if (options.timeout) {
            requestOptions.timeout = options.timeout;
        }
        const response = await client.request(job.url, requestOptions);
        return {
            jobId: job.jobId,
            success: response.ok,
            response,
            duration: Date.now() - startedAt,
        };
    }
    catch (error) {
        const failure = error instanceof Error ? error : new Error(String(error));
        return {
            jobId: job.jobId,
            success: false,
            error: failure,
            duration: Date.now() - startedAt,
        };
    }
}
|
|
38
|
+
/**
 * Runtime type guard for queue jobs deserialized from external storage.
 * Validates only the required fields (jobId/url/method strings, a non-null
 * headers object, a numeric createdAt); optional fields are not checked.
 */
export function isValidQueueJob(value) {
    if (value == null || typeof value !== 'object') {
        return false;
    }
    const candidate = value;
    const hasStringCore = ['jobId', 'url', 'method'].every((field) => typeof candidate[field] === 'string');
    const hasHeaders = typeof candidate.headers === 'object' && candidate.headers !== null;
    return hasStringCore && hasHeaders && typeof candidate.createdAt === 'number';
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
export interface CrawlQueueItem {
|
|
2
|
+
url: string;
|
|
3
|
+
depth: number;
|
|
4
|
+
priority?: number;
|
|
5
|
+
discoveredFrom?: string;
|
|
6
|
+
}
|
|
7
|
+
export interface CrawlQueueAdapter {
|
|
8
|
+
push(item: CrawlQueueItem): Promise<void>;
|
|
9
|
+
pushBatch?(items: CrawlQueueItem[]): Promise<void>;
|
|
10
|
+
pop(): Promise<CrawlQueueItem | null>;
|
|
11
|
+
hasVisited(url: string): Promise<boolean>;
|
|
12
|
+
hasVisitedBatch?(urls: string[]): Promise<Set<string>>;
|
|
13
|
+
markVisited(url: string): Promise<void>;
|
|
14
|
+
size(): Promise<number>;
|
|
15
|
+
clear(): Promise<void>;
|
|
16
|
+
close?(): Promise<void>;
|
|
17
|
+
}
|
|
18
|
+
export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
|
|
19
|
+
private queue;
|
|
20
|
+
private visited;
|
|
21
|
+
push(item: CrawlQueueItem): Promise<void>;
|
|
22
|
+
pushBatch(items: CrawlQueueItem[]): Promise<void>;
|
|
23
|
+
pop(): Promise<CrawlQueueItem | null>;
|
|
24
|
+
hasVisited(url: string): Promise<boolean>;
|
|
25
|
+
hasVisitedBatch(urls: string[]): Promise<Set<string>>;
|
|
26
|
+
markVisited(url: string): Promise<void>;
|
|
27
|
+
size(): Promise<number>;
|
|
28
|
+
clear(): Promise<void>;
|
|
29
|
+
close(): Promise<void>;
|
|
30
|
+
getVisited(): Set<string>;
|
|
31
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
 * Default FIFO crawl frontier backed by process memory.
 * Tracks visited URLs in a Set; suited to single-process crawls where
 * persistence across restarts is not required.
 */
export class InMemoryCrawlQueue {
    queue = [];
    visited = new Set();
    async push(item) {
        this.queue.push(item);
    }
    async pushBatch(items) {
        // Loop rather than spread so very large batches cannot blow the stack.
        for (const item of items) {
            this.queue.push(item);
        }
    }
    async pop() {
        // FIFO: the oldest discovered URL is crawled first.
        const next = this.queue.shift();
        return next !== undefined ? next : null;
    }
    async hasVisited(url) {
        return this.visited.has(url);
    }
    async hasVisitedBatch(urls) {
        return new Set(urls.filter((url) => this.visited.has(url)));
    }
    async markVisited(url) {
        this.visited.add(url);
    }
    async size() {
        return this.queue.length;
    }
    async clear() {
        this.queue = [];
        this.visited.clear();
    }
    async close() {
        await this.clear();
    }
    getVisited() {
        return this.visited;
    }
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { SpiderPageResult } from './spider.js';
|
|
2
|
+
export interface CrawlStorageAdapter {
|
|
3
|
+
saveResult(result: SpiderPageResult): Promise<void>;
|
|
4
|
+
saveError(error: {
|
|
5
|
+
url: string;
|
|
6
|
+
error: string;
|
|
7
|
+
}): Promise<void>;
|
|
8
|
+
getResultCount(): Promise<number>;
|
|
9
|
+
getResults(): Promise<SpiderPageResult[]>;
|
|
10
|
+
getErrors(): Promise<Array<{
|
|
11
|
+
url: string;
|
|
12
|
+
error: string;
|
|
13
|
+
}>>;
|
|
14
|
+
clear(): Promise<void>;
|
|
15
|
+
close?(): Promise<void>;
|
|
16
|
+
}
|
|
17
|
+
export declare class InMemoryCrawlStorage implements CrawlStorageAdapter {
|
|
18
|
+
private results;
|
|
19
|
+
private errors;
|
|
20
|
+
saveResult(result: SpiderPageResult): Promise<void>;
|
|
21
|
+
saveError(error: {
|
|
22
|
+
url: string;
|
|
23
|
+
error: string;
|
|
24
|
+
}): Promise<void>;
|
|
25
|
+
getResultCount(): Promise<number>;
|
|
26
|
+
getResults(): Promise<SpiderPageResult[]>;
|
|
27
|
+
getErrors(): Promise<Array<{
|
|
28
|
+
url: string;
|
|
29
|
+
error: string;
|
|
30
|
+
}>>;
|
|
31
|
+
clear(): Promise<void>;
|
|
32
|
+
close(): Promise<void>;
|
|
33
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * In-memory CrawlStorageAdapter collecting page results and errors in
 * plain arrays. getResults()/getErrors() return the live arrays, not
 * copies, so callers should treat them as read-only snapshots.
 */
export class InMemoryCrawlStorage {
    results = [];
    errors = [];
    async saveResult(result) {
        this.results.push(result);
    }
    async saveError(error) {
        this.errors.push(error);
    }
    async getResultCount() {
        return this.results.length;
    }
    async getResults() {
        return this.results;
    }
    async getErrors() {
        return this.errors;
    }
    async clear() {
        // Fresh arrays so previously handed-out references are not mutated.
        this.results = [];
        this.errors = [];
    }
    async close() {
        await this.clear();
    }
}
|
package/dist/scrape/index.d.ts
CHANGED
|
@@ -4,5 +4,11 @@ export { ScrapeDocument } from './document.js';
|
|
|
4
4
|
export { ScrapeElement } from './element.js';
|
|
5
5
|
export { Spider, spider } from './spider.js';
|
|
6
6
|
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
7
|
+
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
8
|
+
export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
9
|
+
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
10
|
+
export type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
11
|
+
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
12
|
+
export type { ProxyAdapter } from './proxy-adapter.js';
|
|
7
13
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
8
14
|
export type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedFormField, ExtractedTable, ExtractedScript, ExtractedStyle, ExtractionSchema, ExtractionSchemaField, ScrapeOptions, LinkExtractionOptions, ImageExtractionOptions, } from './types.js';
|
package/dist/scrape/index.js
CHANGED
|
@@ -2,4 +2,7 @@ export { parse as parseHtmlSync, HTMLElement, TextNode, CommentNode, Node, NodeT
|
|
|
2
2
|
export { ScrapeDocument } from './document.js';
|
|
3
3
|
export { ScrapeElement } from './element.js';
|
|
4
4
|
export { Spider, spider } from './spider.js';
|
|
5
|
+
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
6
|
+
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
7
|
+
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
5
8
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
export interface ProxyAdapter {
|
|
2
|
+
getProxy(): Promise<string | null>;
|
|
3
|
+
reportResult?(proxy: string, success: boolean): Promise<void>;
|
|
4
|
+
close?(): Promise<void>;
|
|
5
|
+
}
|
|
6
|
+
export declare class ListProxyAdapter implements ProxyAdapter {
|
|
7
|
+
private readonly proxies;
|
|
8
|
+
private index;
|
|
9
|
+
constructor(proxies: string[]);
|
|
10
|
+
getProxy(): Promise<string>;
|
|
11
|
+
close(): Promise<void>;
|
|
12
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Round-robin ProxyAdapter over a fixed proxy list.
 * getProxy() cycles through the list indefinitely; there is no health
 * tracking, so reportResult is intentionally not implemented.
 */
export class ListProxyAdapter {
    proxies;
    index = 0;
    constructor(proxies) {
        this.proxies = proxies;
        // An empty list would make every getProxy() return undefined; fail fast.
        if (proxies.length === 0) {
            throw new Error('ListProxyAdapter requires at least one proxy');
        }
    }
    async getProxy() {
        const position = this.index % this.proxies.length;
        this.index += 1;
        return this.proxies[position];
    }
    async close() {
        // Nothing to release for a static list.
    }
}
|
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -2,6 +2,9 @@ import type { ExtractedLink, ExtractionSchema } from './types.js';
|
|
|
2
2
|
import type { Options as ParserOptions } from './parser/index.js';
|
|
3
3
|
import { type SitemapUrl } from '../seo/validators/sitemap.js';
|
|
4
4
|
import { type CaptchaDetectionResult, type BlockDetectionResult } from '../utils/block-detector.js';
|
|
5
|
+
import { type CrawlQueueAdapter } from './crawl-queue.js';
|
|
6
|
+
import { type CrawlStorageAdapter } from './crawl-storage.js';
|
|
7
|
+
import { type ProxyAdapter } from './proxy-adapter.js';
|
|
5
8
|
export type SpiderTransport = 'auto' | 'undici' | 'curl';
|
|
6
9
|
type CaptchaProvider = CaptchaDetectionResult['provider'];
|
|
7
10
|
export interface SpiderOptions {
|
|
@@ -25,6 +28,7 @@ export interface SpiderOptions {
|
|
|
25
28
|
respectRobotsTxt?: boolean;
|
|
26
29
|
useSitemap?: boolean;
|
|
27
30
|
sitemapUrl?: string;
|
|
31
|
+
proxy?: string | string[] | ProxyAdapter;
|
|
28
32
|
transport?: SpiderTransport;
|
|
29
33
|
preferCurlFirst?: boolean;
|
|
30
34
|
onPage?: (result: SpiderPageResult) => void;
|
|
@@ -39,6 +43,8 @@ export interface SpiderOptions {
|
|
|
39
43
|
onProgress?: (progress: SpiderProgress) => void;
|
|
40
44
|
extract?: string[] | ExtractionSchema;
|
|
41
45
|
parserOptions?: Partial<ParserOptions>;
|
|
46
|
+
crawlQueue?: CrawlQueueAdapter;
|
|
47
|
+
crawlStorage?: CrawlStorageAdapter;
|
|
42
48
|
}
|
|
43
49
|
export interface SpiderPageResult {
|
|
44
50
|
url: string;
|
|
@@ -146,10 +152,13 @@ export declare class Spider {
|
|
|
146
152
|
private options;
|
|
147
153
|
private client;
|
|
148
154
|
private pool;
|
|
149
|
-
private
|
|
150
|
-
private
|
|
151
|
-
private
|
|
152
|
-
private
|
|
155
|
+
private crawlQueue;
|
|
156
|
+
private crawlStorage;
|
|
157
|
+
private proxyAdapter;
|
|
158
|
+
private proxyClients;
|
|
159
|
+
private _visitedCount;
|
|
160
|
+
private _queueSize;
|
|
161
|
+
private _resultCount;
|
|
153
162
|
private baseHost;
|
|
154
163
|
private running;
|
|
155
164
|
private aborted;
|
|
@@ -170,6 +179,7 @@ export declare class Spider {
|
|
|
170
179
|
private fetchSitemaps;
|
|
171
180
|
private buildSitemapAnalysis;
|
|
172
181
|
private buildRobotsAnalysis;
|
|
182
|
+
private getClientForProxy;
|
|
173
183
|
private fetchPage;
|
|
174
184
|
private crawlPage;
|
|
175
185
|
private getOrCreateDomainState;
|