rezo 1.0.41 → 1.0.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/curl.cjs +143 -32
- package/dist/adapters/curl.js +143 -32
- package/dist/adapters/entries/curl.d.ts +65 -0
- package/dist/adapters/entries/fetch.d.ts +65 -0
- package/dist/adapters/entries/http.d.ts +65 -0
- package/dist/adapters/entries/http2.d.ts +65 -0
- package/dist/adapters/entries/react-native.d.ts +65 -0
- package/dist/adapters/entries/xhr.d.ts +65 -0
- package/dist/adapters/fetch.cjs +98 -12
- package/dist/adapters/fetch.js +98 -12
- package/dist/adapters/http.cjs +26 -14
- package/dist/adapters/http.js +26 -14
- package/dist/adapters/http2.cjs +756 -227
- package/dist/adapters/http2.js +756 -227
- package/dist/adapters/index.cjs +6 -6
- package/dist/adapters/xhr.cjs +94 -2
- package/dist/adapters/xhr.js +94 -2
- package/dist/cache/dns-cache.cjs +5 -3
- package/dist/cache/dns-cache.js +5 -3
- package/dist/cache/file-cacher.cjs +7 -1
- package/dist/cache/file-cacher.js +7 -1
- package/dist/cache/index.cjs +15 -13
- package/dist/cache/index.js +1 -0
- package/dist/cache/navigation-history.cjs +298 -0
- package/dist/cache/navigation-history.js +296 -0
- package/dist/cache/url-store.cjs +7 -1
- package/dist/cache/url-store.js +7 -1
- package/dist/core/rezo.cjs +7 -0
- package/dist/core/rezo.js +7 -0
- package/dist/crawler.d.ts +196 -11
- package/dist/entries/crawler.cjs +5 -5
- package/dist/index.cjs +27 -24
- package/dist/index.d.ts +73 -0
- package/dist/index.js +1 -0
- package/dist/internal/agents/base.cjs +113 -0
- package/dist/internal/agents/base.js +110 -0
- package/dist/internal/agents/http-proxy.cjs +89 -0
- package/dist/internal/agents/http-proxy.js +86 -0
- package/dist/internal/agents/https-proxy.cjs +176 -0
- package/dist/internal/agents/https-proxy.js +173 -0
- package/dist/internal/agents/index.cjs +10 -0
- package/dist/internal/agents/index.js +5 -0
- package/dist/internal/agents/socks-client.cjs +571 -0
- package/dist/internal/agents/socks-client.js +567 -0
- package/dist/internal/agents/socks-proxy.cjs +75 -0
- package/dist/internal/agents/socks-proxy.js +72 -0
- package/dist/platform/browser.d.ts +65 -0
- package/dist/platform/bun.d.ts +65 -0
- package/dist/platform/deno.d.ts +65 -0
- package/dist/platform/node.d.ts +65 -0
- package/dist/platform/react-native.d.ts +65 -0
- package/dist/platform/worker.d.ts +65 -0
- package/dist/plugin/crawler-options.cjs +1 -1
- package/dist/plugin/crawler-options.js +1 -1
- package/dist/plugin/crawler.cjs +192 -1
- package/dist/plugin/crawler.js +192 -1
- package/dist/plugin/index.cjs +36 -36
- package/dist/proxy/index.cjs +18 -16
- package/dist/proxy/index.js +17 -12
- package/dist/queue/index.cjs +8 -8
- package/dist/responses/buildError.cjs +11 -2
- package/dist/responses/buildError.js +11 -2
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/agent-pool.cjs +1 -17
- package/dist/utils/agent-pool.js +1 -17
- package/dist/utils/curl.cjs +317 -0
- package/dist/utils/curl.js +314 -0
- package/package.json +1 -1
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { createHash } from "node:crypto";
|
|
4
|
+
/**
 * Identify the JavaScript runtime this code is executing under.
 *
 * Detection relies on runtime-specific globals: Bun and Deno each expose a
 * namesake global object; anything else is treated as Node.js.
 *
 * @returns {"bun" | "deno" | "node"} the detected runtime identifier
 */
function detectRuntime() {
  const hasGlobal = (name) => typeof globalThis[name] !== "undefined";
  if (hasGlobal("Bun")) {
    return "bun";
  }
  return hasGlobal("Deno") ? "deno" : "node";
}
|
|
11
|
+
/**
 * Open a SQLite database using the driver native to the current runtime and
 * wrap it in a uniform `{ run, get, all, close }` interface so callers never
 * need to know which engine is underneath.
 *
 * - Bun:  `bun:sqlite` `Database`
 * - Deno: `node:sqlite` via Deno's Node compatibility layer
 * - Node: built-in `node:sqlite` `DatabaseSync`
 *
 * @param {string} dbPath - filesystem path of the database file
 * @returns {Promise<{run: Function, get: Function, all: Function, close: Function}>}
 * @throws {Error} on Deno when the Node compatibility `node:sqlite` module is unavailable
 */
async function createDatabase(dbPath) {
  const runtime = detectRuntime();
  if (runtime === "bun") {
    const { Database } = await import("bun:sqlite");
    const db = new Database(dbPath);
    return {
      run: (sql, ...params) => db.run(sql, ...params),
      get: (sql, ...params) => db.query(sql).get(...params),
      all: (sql, ...params) => db.query(sql).all(...params),
      close: () => db.close()
    };
  }
  // node:sqlite exports `DatabaseSync` — there is no `Database` export, so the
  // previous `{ Database }` destructuring always produced undefined and fell
  // into the catch. Both the Deno and Node branches share this wrapper shape.
  const makeNodeSqliteWrapper = (db) => ({
    run: (sql, ...params) => {
      if (params.length === 0) {
        // exec() accepts no bind parameters but handles DDL and
        // multi-statement SQL that prepare() may reject.
        db.exec(sql);
      } else {
        db.prepare(sql).run(...params);
      }
    },
    get: (sql, ...params) => {
      const stmt = db.prepare(sql);
      return stmt.get(...params);
    },
    all: (sql, ...params) => {
      const stmt = db.prepare(sql);
      return stmt.all(...params);
    },
    close: () => db.close()
  });
  if (runtime === "deno") {
    try {
      const { DatabaseSync } = await import("node:sqlite");
      return makeNodeSqliteWrapper(new DatabaseSync(dbPath));
    } catch {
      throw new Error("Deno SQLite support requires Node.js compatibility mode");
    }
  }
  const { DatabaseSync } = await import("node:sqlite");
  return makeNodeSqliteWrapper(new DatabaseSync(dbPath));
}
|
|
61
|
+
|
|
62
|
+
/**
 * Persistent crawl navigation history backed by SQLite.
 *
 * Tracks crawl sessions, a prioritized URL queue, and visited URLs so a crawl
 * can be paused and resumed across process restarts. All write paths dedupe on
 * a per-session URL key (optionally a sha256 hash of the URL).
 */
export class NavigationHistory {
  // { run, get, all, close } wrapper from createDatabase; null before
  // initialize() completes and after close().
  db = null;
  // Normalized constructor options (storeDir, dbFileName, hashUrls).
  options;
  // Absolute directory that holds the database file.
  storeDir;
  // Absolute path of the SQLite database file.
  dbPath;
  // Set once close() runs; every subsequent data call throws.
  closed = false;
  // Memoized initialize() promise so schema setup runs exactly once.
  initPromise = null;
  /**
   * @param {object} [options]
   * @param {string} [options.storeDir="./navigation-history"] - directory for the database file (created if missing)
   * @param {string} [options.dbFileName="navigation.db"] - database file name
   * @param {boolean} [options.hashUrls=false] - store sha256(url) instead of the raw URL as the dedupe key
   */
  constructor(options = {}) {
    this.options = {
      storeDir: options.storeDir || "./navigation-history",
      dbFileName: options.dbFileName || "navigation.db",
      hashUrls: options.hashUrls ?? false
    };
    this.storeDir = path.resolve(this.options.storeDir);
    this.dbPath = path.join(this.storeDir, this.options.dbFileName);
    if (!fs.existsSync(this.storeDir)) {
      fs.mkdirSync(this.storeDir, { recursive: true });
    }
  }
  /**
   * Construct and fully initialize a store in one step.
   * @param {object} [options] - same options as the constructor
   * @returns {Promise<NavigationHistory>}
   */
  static async create(options = {}) {
    const store = new NavigationHistory(options);
    await store.initialize();
    return store;
  }
  /**
   * Open the database and create tables/indexes. Idempotent: concurrent and
   * repeated calls share the same memoized promise.
   * @returns {Promise<void>}
   */
  async initialize() {
    if (this.initPromise)
      return this.initPromise;
    this.initPromise = (async () => {
      this.db = await createDatabase(this.dbPath);
      this.db.run(`
        CREATE TABLE IF NOT EXISTS sessions (
          sessionId TEXT PRIMARY KEY,
          baseUrl TEXT NOT NULL,
          startedAt INTEGER NOT NULL,
          lastActivityAt INTEGER NOT NULL,
          status TEXT DEFAULT 'running',
          urlsVisited INTEGER DEFAULT 0,
          urlsQueued INTEGER DEFAULT 0,
          urlsFailed INTEGER DEFAULT 0,
          metadata TEXT
        )
      `);
      this.db.run(`
        CREATE TABLE IF NOT EXISTS queue (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          sessionId TEXT NOT NULL,
          urlKey TEXT NOT NULL,
          originalUrl TEXT NOT NULL,
          method TEXT DEFAULT 'GET',
          priority INTEGER DEFAULT 0,
          body TEXT,
          headers TEXT,
          metadata TEXT,
          addedAt INTEGER NOT NULL,
          UNIQUE(sessionId, urlKey)
        )
      `);
      this.db.run(`
        CREATE TABLE IF NOT EXISTS visited (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          sessionId TEXT NOT NULL,
          urlKey TEXT NOT NULL,
          originalUrl TEXT NOT NULL,
          status INTEGER,
          visitedAt INTEGER NOT NULL,
          finalUrl TEXT,
          contentType TEXT,
          errorMessage TEXT,
          UNIQUE(sessionId, urlKey)
        )
      `);
      this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_session ON queue(sessionId)");
      this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(sessionId, priority DESC)");
      this.db.run("CREATE INDEX IF NOT EXISTS idx_visited_session ON visited(sessionId)");
      this.db.run("CREATE INDEX IF NOT EXISTS idx_sessions_status ON sessions(status)");
    })();
    return this.initPromise;
  }
  // Guard shared by every data method: throws when the store was closed or
  // initialize() has not completed yet.
  #assertOpen() {
    if (this.closed || !this.db)
      throw new Error("NavigationHistory is closed");
  }
  /**
   * Compute the dedupe key for a URL: sha256 hex digest when hashUrls is
   * enabled, otherwise the URL itself.
   */
  getUrlKey(url) {
    if (this.options.hashUrls) {
      return createHash("sha256").update(url).digest("hex");
    }
    return url;
  }
  /**
   * Create (or overwrite) a crawl session row and return its in-memory record.
   * @param {string} sessionId - unique session identifier
   * @param {string} baseUrl - starting URL of the crawl
   * @param {object} [metadata] - arbitrary JSON-serializable metadata
   */
  async createSession(sessionId, baseUrl, metadata) {
    this.#assertOpen();
    const now = Date.now();
    const session = {
      sessionId,
      baseUrl,
      startedAt: now,
      lastActivityAt: now,
      status: "running",
      urlsVisited: 0,
      urlsQueued: 0,
      urlsFailed: 0,
      metadata: metadata ? JSON.stringify(metadata) : undefined
    };
    this.db.run(`INSERT OR REPLACE INTO sessions (sessionId, baseUrl, startedAt, lastActivityAt, status, urlsVisited, urlsQueued, urlsFailed, metadata)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, sessionId, baseUrl, now, now, "running", 0, 0, 0, session.metadata ?? null);
    return session;
  }
  /** Fetch a session row by id (undefined when it does not exist). */
  async getSession(sessionId) {
    this.#assertOpen();
    return this.db.get("SELECT * FROM sessions WHERE sessionId = ?", sessionId);
  }
  /** Set a session's status ('running' | 'paused' | 'completed' | 'failed') and bump lastActivityAt. */
  async updateSessionStatus(sessionId, status) {
    this.#assertOpen();
    this.db.run("UPDATE sessions SET status = ?, lastActivityAt = ? WHERE sessionId = ?", status, Date.now(), sessionId);
  }
  /**
   * Update only the counters provided in `stats` (urlsVisited / urlsQueued /
   * urlsFailed); lastActivityAt is always refreshed.
   */
  async updateSessionStats(sessionId, stats) {
    this.#assertOpen();
    const updates = ["lastActivityAt = ?"];
    const params = [Date.now()];
    if (stats.urlsVisited !== undefined) {
      updates.push("urlsVisited = ?");
      params.push(stats.urlsVisited);
    }
    if (stats.urlsQueued !== undefined) {
      updates.push("urlsQueued = ?");
      params.push(stats.urlsQueued);
    }
    if (stats.urlsFailed !== undefined) {
      updates.push("urlsFailed = ?");
      params.push(stats.urlsFailed);
    }
    params.push(sessionId);
    this.db.run(`UPDATE sessions SET ${updates.join(", ")} WHERE sessionId = ?`, ...params);
  }
  /**
   * Queue a URL for crawling. Returns false (no-op) when the URL is already
   * queued or already visited in this session, true when it was inserted.
   * @param {string} sessionId
   * @param {string} url
   * @param {{method?: string, priority?: number, body?: any, headers?: object, metadata?: object}} [options]
   * @returns {Promise<boolean>} whether the URL was newly queued
   */
  async addToQueue(sessionId, url, options = {}) {
    this.#assertOpen();
    const urlKey = this.getUrlKey(url);
    const existing = this.db.get("SELECT id FROM queue WHERE sessionId = ? AND urlKey = ?", sessionId, urlKey);
    if (existing)
      return false;
    const isVisited = this.db.get("SELECT id FROM visited WHERE sessionId = ? AND urlKey = ?", sessionId, urlKey);
    if (isVisited)
      return false;
    this.db.run(`INSERT INTO queue (sessionId, urlKey, originalUrl, method, priority, body, headers, metadata, addedAt)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, sessionId, urlKey, url, options.method || "GET", options.priority || 0, options.body ? JSON.stringify(options.body) : null, options.headers ? JSON.stringify(options.headers) : null, options.metadata ? JSON.stringify(options.metadata) : null, Date.now());
    return true;
  }
  /**
   * Peek the highest-priority (then oldest) queued entry without removing it.
   * Returns undefined when the queue is empty.
   */
  async getNextFromQueue(sessionId) {
    this.#assertOpen();
    const item = this.db.get("SELECT originalUrl as url, method, priority, body, headers, metadata, addedAt FROM queue WHERE sessionId = ? ORDER BY priority DESC, addedAt ASC LIMIT 1", sessionId);
    return item;
  }
  /** Remove a URL from the queue. Always resolves true (delete is idempotent). */
  async removeFromQueue(sessionId, url) {
    this.#assertOpen();
    const urlKey = this.getUrlKey(url);
    this.db.run("DELETE FROM queue WHERE sessionId = ? AND urlKey = ?", sessionId, urlKey);
    return true;
  }
  /** Number of entries currently queued for a session. */
  async getQueueSize(sessionId) {
    this.#assertOpen();
    const result = this.db.get("SELECT COUNT(*) as count FROM queue WHERE sessionId = ?", sessionId);
    return result?.count || 0;
  }
  /**
   * Record a URL as visited (dequeuing it first) with the outcome of the
   * request. Re-visiting replaces the prior record for the same urlKey.
   * @param {{status?: number, finalUrl?: string, contentType?: string, errorMessage?: string}} [result]
   */
  async markVisited(sessionId, url, result = {}) {
    this.#assertOpen();
    const urlKey = this.getUrlKey(url);
    this.db.run("DELETE FROM queue WHERE sessionId = ? AND urlKey = ?", sessionId, urlKey);
    this.db.run(`INSERT OR REPLACE INTO visited (sessionId, urlKey, originalUrl, status, visitedAt, finalUrl, contentType, errorMessage)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, sessionId, urlKey, url, result.status || 0, Date.now(), result.finalUrl ?? null, result.contentType ?? null, result.errorMessage ?? null);
  }
  /** Whether a URL has already been visited in this session. */
  async isVisited(sessionId, url) {
    this.#assertOpen();
    const urlKey = this.getUrlKey(url);
    const result = this.db.get("SELECT id FROM visited WHERE sessionId = ? AND urlKey = ?", sessionId, urlKey);
    return !!result;
  }
  /** Number of visited entries recorded for a session. */
  async getVisitedCount(sessionId) {
    this.#assertOpen();
    const result = this.db.get("SELECT COUNT(*) as count FROM visited WHERE sessionId = ?", sessionId);
    return result?.count || 0;
  }
  /**
   * Visited entries that failed: HTTP status >= 400 or a recorded error.
   * FIX: the visited table stores the URL in `originalUrl`; the previous
   * `SELECT url, ...` referenced a nonexistent column and made this query
   * fail. Alias it as `url` to match getAllQueuedUrls()' result shape.
   */
  async getFailedUrls(sessionId) {
    this.#assertOpen();
    return this.db.all("SELECT originalUrl as url, status, visitedAt, finalUrl, contentType, errorMessage FROM visited WHERE sessionId = ? AND (status >= 400 OR errorMessage IS NOT NULL)", sessionId);
  }
  /** Every queued entry for a session, highest priority first, then oldest. */
  async getAllQueuedUrls(sessionId) {
    this.#assertOpen();
    return this.db.all("SELECT originalUrl as url, method, priority, body, headers, metadata, addedAt FROM queue WHERE sessionId = ? ORDER BY priority DESC, addedAt ASC", sessionId);
  }
  /** Delete all queued entries for a session. */
  async clearQueue(sessionId) {
    this.#assertOpen();
    this.db.run("DELETE FROM queue WHERE sessionId = ?", sessionId);
  }
  /** Delete all visited entries for a session. */
  async clearVisited(sessionId) {
    this.#assertOpen();
    this.db.run("DELETE FROM visited WHERE sessionId = ?", sessionId);
  }
  /** Delete a session and all of its queue/visited rows. */
  async deleteSession(sessionId) {
    this.#assertOpen();
    this.db.run("DELETE FROM queue WHERE sessionId = ?", sessionId);
    this.db.run("DELETE FROM visited WHERE sessionId = ?", sessionId);
    this.db.run("DELETE FROM sessions WHERE sessionId = ?", sessionId);
  }
  /** Sessions that can be resumed ('running' or 'paused'), most recent first. */
  async getResumableSessions() {
    this.#assertOpen();
    return this.db.all("SELECT * FROM sessions WHERE status IN ('running', 'paused') ORDER BY lastActivityAt DESC");
  }
  /** Close the underlying database. Safe to call multiple times. */
  async close() {
    if (this.closed)
      return;
    this.closed = true;
    if (this.db) {
      this.db.close();
      this.db = null;
    }
  }
  /** Whether close() has been called. */
  get isClosed() {
    return this.closed;
  }
  /** Absolute path of the SQLite database file. */
  get databasePath() {
    return this.dbPath;
  }
}
|
package/dist/cache/url-store.cjs
CHANGED
|
@@ -45,7 +45,13 @@ async function createDatabase(dbPath) {
|
|
|
45
45
|
const { DatabaseSync } = await import("node:sqlite");
|
|
46
46
|
const db = new DatabaseSync(dbPath);
|
|
47
47
|
return {
|
|
48
|
-
run: (sql, ...params) =>
|
|
48
|
+
run: (sql, ...params) => {
|
|
49
|
+
if (params.length === 0) {
|
|
50
|
+
db.exec(sql);
|
|
51
|
+
} else {
|
|
52
|
+
db.prepare(sql).run(...params);
|
|
53
|
+
}
|
|
54
|
+
},
|
|
49
55
|
get: (sql, ...params) => {
|
|
50
56
|
const stmt = db.prepare(sql);
|
|
51
57
|
return stmt.get(...params);
|
package/dist/cache/url-store.js
CHANGED
|
@@ -45,7 +45,13 @@ async function createDatabase(dbPath) {
|
|
|
45
45
|
const { DatabaseSync } = await import("node:sqlite");
|
|
46
46
|
const db = new DatabaseSync(dbPath);
|
|
47
47
|
return {
|
|
48
|
-
run: (sql, ...params) =>
|
|
48
|
+
run: (sql, ...params) => {
|
|
49
|
+
if (params.length === 0) {
|
|
50
|
+
db.exec(sql);
|
|
51
|
+
} else {
|
|
52
|
+
db.prepare(sql).run(...params);
|
|
53
|
+
}
|
|
54
|
+
},
|
|
49
55
|
get: (sql, ...params) => {
|
|
50
56
|
const stmt = db.prepare(sql);
|
|
51
57
|
return stmt.get(...params);
|
package/dist/core/rezo.cjs
CHANGED
|
@@ -7,6 +7,7 @@ const packageJson = require("../../package.json");
|
|
|
7
7
|
const { createDefaultHooks, mergeHooks, runVoidHooksSync, runTransformHooks } = require('./hooks.cjs');
|
|
8
8
|
const { ResponseCache, DNSCache } = require('../cache/index.cjs');
|
|
9
9
|
const { ProxyManager } = require('../proxy/manager.cjs');
|
|
10
|
+
const { toCurl: toCurlUtil, fromCurl: fromCurlUtil } = require('../utils/curl.cjs');
|
|
10
11
|
let globalAdapter = null;
|
|
11
12
|
function setGlobalAdapter(adapter) {
|
|
12
13
|
globalAdapter = adapter;
|
|
@@ -479,6 +480,12 @@ class Rezo {
|
|
|
479
480
|
clearCookies() {
|
|
480
481
|
this.jar?.removeAllCookiesSync();
|
|
481
482
|
}
|
|
483
|
+
static toCurl(config) {
|
|
484
|
+
return toCurlUtil(config);
|
|
485
|
+
}
|
|
486
|
+
static fromCurl(curlCommand) {
|
|
487
|
+
return fromCurlUtil(curlCommand);
|
|
488
|
+
}
|
|
482
489
|
}
|
|
483
490
|
const defaultTransforms = exports.defaultTransforms = {
|
|
484
491
|
request: [
|
package/dist/core/rezo.js
CHANGED
|
@@ -7,6 +7,7 @@ import packageJson from "../../package.json" with { type: 'json' };
|
|
|
7
7
|
import { createDefaultHooks, mergeHooks, runVoidHooksSync, runTransformHooks } from './hooks.js';
|
|
8
8
|
import { ResponseCache, DNSCache } from '../cache/index.js';
|
|
9
9
|
import { ProxyManager } from '../proxy/manager.js';
|
|
10
|
+
import { toCurl as toCurlUtil, fromCurl as fromCurlUtil } from '../utils/curl.js';
|
|
10
11
|
let globalAdapter = null;
|
|
11
12
|
export function setGlobalAdapter(adapter) {
|
|
12
13
|
globalAdapter = adapter;
|
|
@@ -479,6 +480,12 @@ export class Rezo {
|
|
|
479
480
|
clearCookies() {
|
|
480
481
|
this.jar?.removeAllCookiesSync();
|
|
481
482
|
}
|
|
483
|
+
static toCurl(config) {
|
|
484
|
+
return toCurlUtil(config);
|
|
485
|
+
}
|
|
486
|
+
static fromCurl(curlCommand) {
|
|
487
|
+
return fromCurlUtil(curlCommand);
|
|
488
|
+
}
|
|
482
489
|
}
|
|
483
490
|
export const defaultTransforms = {
|
|
484
491
|
request: [
|
package/dist/crawler.d.ts
CHANGED
|
@@ -228,6 +228,17 @@ declare class FileCacher {
|
|
|
228
228
|
*/
|
|
229
229
|
get directory(): string;
|
|
230
230
|
}
|
|
231
|
+
export interface CrawlSession {
|
|
232
|
+
sessionId: string;
|
|
233
|
+
baseUrl: string;
|
|
234
|
+
startedAt: number;
|
|
235
|
+
lastActivityAt: number;
|
|
236
|
+
status: "running" | "paused" | "completed" | "failed";
|
|
237
|
+
urlsVisited: number;
|
|
238
|
+
urlsQueued: number;
|
|
239
|
+
urlsFailed: number;
|
|
240
|
+
metadata?: string;
|
|
241
|
+
}
|
|
231
242
|
export interface RezoHttpHeaders {
|
|
232
243
|
accept?: string | undefined;
|
|
233
244
|
"accept-encoding"?: string | undefined;
|
|
@@ -4464,6 +4475,71 @@ declare class Rezo {
|
|
|
4464
4475
|
* @see {@link cookieJar} - Access the underlying RezoCookieJar for more control
|
|
4465
4476
|
*/
|
|
4466
4477
|
clearCookies(): void;
|
|
4478
|
+
/**
|
|
4479
|
+
* Convert a Rezo request configuration to a cURL command string.
|
|
4480
|
+
*
|
|
4481
|
+
* Generates a valid cURL command that can be executed in a terminal to
|
|
4482
|
+
* reproduce the same HTTP request. Useful for:
|
|
4483
|
+
* - Debugging and sharing requests
|
|
4484
|
+
* - Documentation and examples
|
|
4485
|
+
* - Testing requests outside of Node.js
|
|
4486
|
+
* - Exporting requests to other tools
|
|
4487
|
+
*
|
|
4488
|
+
* @param config - Request configuration object
|
|
4489
|
+
* @returns A cURL command string
|
|
4490
|
+
*
|
|
4491
|
+
* @example
|
|
4492
|
+
* ```typescript
|
|
4493
|
+
* const curl = Rezo.toCurl({
|
|
4494
|
+
* url: 'https://api.example.com/users',
|
|
4495
|
+
* method: 'POST',
|
|
4496
|
+
* headers: { 'Content-Type': 'application/json' },
|
|
4497
|
+
* body: { name: 'John', email: 'john@example.com' }
|
|
4498
|
+
* });
|
|
4499
|
+
* // Output: curl -X POST -H 'content-type: application/json' --data-raw '{"name":"John","email":"john@example.com"}' -L --compressed 'https://api.example.com/users'
|
|
4500
|
+
* ```
|
|
4501
|
+
*/
|
|
4502
|
+
static toCurl(config: RezoRequestConfig | RezoRequestOptions): string;
|
|
4503
|
+
/**
|
|
4504
|
+
* Parse a cURL command string into a Rezo request configuration.
|
|
4505
|
+
*
|
|
4506
|
+
* Converts a cURL command into a configuration object that can be
|
|
4507
|
+
* passed directly to Rezo request methods. Useful for:
|
|
4508
|
+
* - Importing requests from browser DevTools
|
|
4509
|
+
* - Converting curl examples from API documentation
|
|
4510
|
+
* - Migrating scripts from curl to Rezo
|
|
4511
|
+
*
|
|
4512
|
+
* Supports common cURL options:
|
|
4513
|
+
* - `-X, --request` - HTTP method
|
|
4514
|
+
* - `-H, --header` - Request headers
|
|
4515
|
+
* - `-d, --data, --data-raw, --data-binary` - Request body
|
|
4516
|
+
* - `-u, --user` - Basic authentication
|
|
4517
|
+
* - `-x, --proxy` - Proxy configuration
|
|
4518
|
+
* - `--socks5, --socks4` - SOCKS proxy
|
|
4519
|
+
* - `-L, --location` - Follow redirects
|
|
4520
|
+
* - `--max-redirs` - Maximum redirects
|
|
4521
|
+
* - `--max-time` - Request timeout
|
|
4522
|
+
* - `-k, --insecure` - Skip TLS verification
|
|
4523
|
+
* - `-A, --user-agent` - User agent header
|
|
4524
|
+
*
|
|
4525
|
+
* @param curlCommand - A cURL command string
|
|
4526
|
+
* @returns A request configuration object
|
|
4527
|
+
*
|
|
4528
|
+
* @example
|
|
4529
|
+
* ```typescript
|
|
4530
|
+
* // From browser DevTools "Copy as cURL"
|
|
4531
|
+
* const config = Rezo.fromCurl(`
|
|
4532
|
+
* curl 'https://api.example.com/data' \\
|
|
4533
|
+
* -H 'Authorization: Bearer token123' \\
|
|
4534
|
+
* -H 'Content-Type: application/json'
|
|
4535
|
+
* `);
|
|
4536
|
+
*
|
|
4537
|
+
* // Use with Rezo
|
|
4538
|
+
* const rezo = new Rezo();
|
|
4539
|
+
* const response = await rezo.request(config);
|
|
4540
|
+
* ```
|
|
4541
|
+
*/
|
|
4542
|
+
static fromCurl(curlCommand: string): RezoRequestOptions;
|
|
4467
4543
|
}
|
|
4468
4544
|
/**
|
|
4469
4545
|
* Rezo HTTP Client - Core Types
|
|
@@ -6284,6 +6360,15 @@ declare class Decodo {
|
|
|
6284
6360
|
* const regexDomain: Domain = '^(sub|api)\.example\.com$';
|
|
6285
6361
|
*/
|
|
6286
6362
|
export type Domain = string[] | string | RegExp;
|
|
6363
|
+
/**
|
|
6364
|
+
* Supported HTTP adapter types for crawler requests
|
|
6365
|
+
* @description
|
|
6366
|
+
* - 'http': Standard Node.js HTTP/HTTPS adapter (default)
|
|
6367
|
+
* - 'http2': HTTP/2 adapter with session pooling
|
|
6368
|
+
* - 'curl': cURL adapter for maximum compatibility
|
|
6369
|
+
* - 'fetch': Browser-compatible Fetch API adapter
|
|
6370
|
+
*/
|
|
6371
|
+
export type CrawlerAdapterType = "http" | "http2" | "curl" | "fetch";
|
|
6287
6372
|
/**
|
|
6288
6373
|
* Configuration interface for the CrawlerOptions class
|
|
6289
6374
|
* @description Defines all available options for configuring web crawler behavior,
|
|
@@ -6292,6 +6377,12 @@ export type Domain = string[] | string | RegExp;
|
|
|
6292
6377
|
export interface ICrawlerOptions {
|
|
6293
6378
|
/** Base URL for the crawler - the starting point for crawling operations */
|
|
6294
6379
|
baseUrl: string;
|
|
6380
|
+
/** HTTP adapter to use for requests (default: 'http') */
|
|
6381
|
+
adapter?: CrawlerAdapterType;
|
|
6382
|
+
/** Enable navigation history for resumable crawling (default: false) */
|
|
6383
|
+
enableNavigationHistory?: boolean;
|
|
6384
|
+
/** Session ID for navigation history - allows resuming specific crawl sessions */
|
|
6385
|
+
sessionId?: string;
|
|
6295
6386
|
/** Whether to reject unauthorized SSL certificates (default: true) */
|
|
6296
6387
|
rejectUnauthorized?: boolean;
|
|
6297
6388
|
/** Custom user agent string for HTTP requests */
|
|
@@ -6415,6 +6506,12 @@ export interface ICrawlerOptions {
|
|
|
6415
6506
|
export declare class CrawlerOptions {
|
|
6416
6507
|
/** Base URL for the crawler - the starting point for crawling operations */
|
|
6417
6508
|
baseUrl: string;
|
|
6509
|
+
/** HTTP adapter to use for requests */
|
|
6510
|
+
adapter: CrawlerAdapterType;
|
|
6511
|
+
/** Enable navigation history for resumable crawling */
|
|
6512
|
+
enableNavigationHistory: boolean;
|
|
6513
|
+
/** Session ID for navigation history - allows resuming specific crawl sessions */
|
|
6514
|
+
sessionId: string;
|
|
6418
6515
|
/** Whether to reject unauthorized SSL certificates */
|
|
6419
6516
|
rejectUnauthorized?: boolean;
|
|
6420
6517
|
/** Custom user agent string for HTTP requests */
|
|
@@ -6886,29 +6983,107 @@ export declare class Crawler {
|
|
|
6886
6983
|
private isStorageReady;
|
|
6887
6984
|
private isCacheReady;
|
|
6888
6985
|
private leadsFinder;
|
|
6986
|
+
/** Navigation history for resumable crawling */
|
|
6987
|
+
private navigationHistory;
|
|
6988
|
+
private isNavigationHistoryReady;
|
|
6989
|
+
private isSessionReady;
|
|
6990
|
+
private currentSession;
|
|
6991
|
+
private navigationHistoryInitPromise;
|
|
6992
|
+
/** Adapter-specific request executor */
|
|
6993
|
+
private adapterExecutor;
|
|
6994
|
+
private adapterType;
|
|
6889
6995
|
/**
|
|
6890
6996
|
* Creates a new Crawler instance with the specified configuration.
|
|
6891
6997
|
*
|
|
6892
|
-
* @param
|
|
6893
|
-
* @param
|
|
6998
|
+
* @param crawlerOptions - Crawler configuration options
|
|
6999
|
+
* @param http - Optional Rezo HTTP client instance (creates default if not provided)
|
|
6894
7000
|
*
|
|
6895
7001
|
* @example
|
|
6896
7002
|
* ```typescript
|
|
7003
|
+
* // Basic usage (creates default Rezo instance)
|
|
6897
7004
|
* const crawler = new Crawler({
|
|
6898
|
-
*
|
|
6899
|
-
* baseUrl: 'https://api.example.com',
|
|
6900
|
-
* timeout: 30000,
|
|
7005
|
+
* baseUrl: 'https://example.com',
|
|
6901
7006
|
* enableCache: true,
|
|
6902
7007
|
* cacheDir: './cache',
|
|
6903
|
-
* socksProxies: [{ host: '127.0.0.1', port: 9050 }]
|
|
6904
|
-
* }, {
|
|
6905
|
-
* http: backupHttpClient,
|
|
6906
|
-
* useProxy: false,
|
|
6907
|
-
* concurrency: 5
|
|
6908
7008
|
* });
|
|
7009
|
+
*
|
|
7010
|
+
* // With resumable crawling
|
|
7011
|
+
* const crawler = new Crawler({
|
|
7012
|
+
* baseUrl: 'https://example.com',
|
|
7013
|
+
* enableNavigationHistory: true,
|
|
7014
|
+
* sessionId: 'my-session',
|
|
7015
|
+
* cacheDir: './cache',
|
|
7016
|
+
* });
|
|
7017
|
+
*
|
|
7018
|
+
* // With custom Rezo instance
|
|
7019
|
+
* const crawler = new Crawler({
|
|
7020
|
+
* baseUrl: 'https://example.com',
|
|
7021
|
+
* adapter: 'curl',
|
|
7022
|
+
* }, myRezoInstance);
|
|
6909
7023
|
* ```
|
|
6910
7024
|
*/
|
|
6911
|
-
constructor(crawlerOptions: ICrawlerOptions, http
|
|
7025
|
+
constructor(crawlerOptions: ICrawlerOptions, http?: Rezo);
|
|
7026
|
+
/**
|
|
7027
|
+
* Initialize the HTTP adapter based on configuration
|
|
7028
|
+
*/
|
|
7029
|
+
private initializeAdapter;
|
|
7030
|
+
/**
|
|
7031
|
+
* Initialize navigation history and session
|
|
7032
|
+
*/
|
|
7033
|
+
private initializeNavigationHistory;
|
|
7034
|
+
/**
|
|
7035
|
+
* Wait for navigation history and session to be ready
|
|
7036
|
+
*/
|
|
7037
|
+
private waitForNavigationHistory;
|
|
7038
|
+
/**
|
|
7039
|
+
* Ensure navigation history is ready and return it (or null if not enabled)
|
|
7040
|
+
* This is used by visit() and other methods that need to write to navigation history
|
|
7041
|
+
*/
|
|
7042
|
+
private ensureNavigationHistoryReady;
|
|
7043
|
+
/**
|
|
7044
|
+
* Add URL to navigation history queue
|
|
7045
|
+
*/
|
|
7046
|
+
private addToNavigationQueue;
|
|
7047
|
+
/**
|
|
7048
|
+
* Mark URL as visited in navigation history
|
|
7049
|
+
*/
|
|
7050
|
+
private markUrlVisited;
|
|
7051
|
+
/**
|
|
7052
|
+
* Get the current crawl session
|
|
7053
|
+
*/
|
|
7054
|
+
getSession(): CrawlSession | null;
|
|
7055
|
+
/**
|
|
7056
|
+
* Get the session ID
|
|
7057
|
+
*/
|
|
7058
|
+
getSessionId(): string;
|
|
7059
|
+
/**
|
|
7060
|
+
* Resume a previous crawl session
|
|
7061
|
+
* @param sessionId - Optional session ID to resume (uses current session if not provided)
|
|
7062
|
+
* @returns Promise resolving to the Crawler instance for chaining
|
|
7063
|
+
*/
|
|
7064
|
+
resume(sessionId?: string): Promise<Crawler>;
|
|
7065
|
+
/**
|
|
7066
|
+
* Get list of resumable sessions
|
|
7067
|
+
* @returns Promise resolving to array of sessions that can be resumed
|
|
7068
|
+
*/
|
|
7069
|
+
getResumableSessions(): Promise<CrawlSession[]>;
|
|
7070
|
+
/**
|
|
7071
|
+
* Pause the current crawl session
|
|
7072
|
+
*/
|
|
7073
|
+
pause(): Promise<void>;
|
|
7074
|
+
/**
|
|
7075
|
+
* Mark the current session as completed
|
|
7076
|
+
*/
|
|
7077
|
+
complete(): Promise<void>;
|
|
7078
|
+
/**
|
|
7079
|
+
* Get the current adapter type being used
|
|
7080
|
+
*/
|
|
7081
|
+
getAdapterType(): CrawlerAdapterType;
|
|
7082
|
+
/**
|
|
7083
|
+
* Switch to a different adapter at runtime
|
|
7084
|
+
* @param adapter - The adapter type to switch to
|
|
7085
|
+
*/
|
|
7086
|
+
setAdapter(adapter: CrawlerAdapterType): Promise<void>;
|
|
6912
7087
|
private rawResponseHandler;
|
|
6913
7088
|
private waitForCache;
|
|
6914
7089
|
private waitForStorage;
|
|
@@ -7303,6 +7478,16 @@ export declare class Crawler {
|
|
|
7303
7478
|
* ```
|
|
7304
7479
|
*/
|
|
7305
7480
|
waitForAll(): Promise<void>;
|
|
7481
|
+
/**
|
|
7482
|
+
* Alias for waitForAll() - waits for all crawling operations to complete.
|
|
7483
|
+
* @returns Promise that resolves when done
|
|
7484
|
+
* @example
|
|
7485
|
+
* ```typescript
|
|
7486
|
+
* crawler.visit('https://example.com');
|
|
7487
|
+
* await crawler.done();
|
|
7488
|
+
* ```
|
|
7489
|
+
*/
|
|
7490
|
+
done(): Promise<void>;
|
|
7306
7491
|
close(): Promise<void>;
|
|
7307
7492
|
}
|
|
7308
7493
|
|
package/dist/entries/crawler.cjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.Crawler =
|
|
3
|
-
const
|
|
4
|
-
exports.CrawlerOptions =
|
|
5
|
-
exports.Domain =
|
|
1
|
+
const _mod_l31jyt = require('../plugin/crawler.cjs');
|
|
2
|
+
exports.Crawler = _mod_l31jyt.Crawler;;
|
|
3
|
+
const _mod_2ht78p = require('../plugin/crawler-options.cjs');
|
|
4
|
+
exports.CrawlerOptions = _mod_2ht78p.CrawlerOptions;
|
|
5
|
+
exports.Domain = _mod_2ht78p.Domain;;
|