imperium-crawl 2.4.0 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -9
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +23 -3
- package/dist/cli.js.map +1 -1
- package/dist/constants.d.ts +1 -1
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +31 -1
- package/dist/constants.js.map +1 -1
- package/dist/flows/engine.d.ts +7 -0
- package/dist/flows/engine.d.ts.map +1 -0
- package/dist/flows/engine.js +183 -0
- package/dist/flows/engine.js.map +1 -0
- package/dist/flows/index.d.ts +6 -0
- package/dist/flows/index.d.ts.map +1 -0
- package/dist/flows/index.js +6 -0
- package/dist/flows/index.js.map +1 -0
- package/dist/flows/server.d.ts +11 -0
- package/dist/flows/server.d.ts.map +1 -0
- package/dist/flows/server.js +81 -0
- package/dist/flows/server.js.map +1 -0
- package/dist/flows/smart-target.d.ts +9 -0
- package/dist/flows/smart-target.d.ts.map +1 -0
- package/dist/flows/smart-target.js +84 -0
- package/dist/flows/smart-target.js.map +1 -0
- package/dist/flows/storage.d.ts +26 -0
- package/dist/flows/storage.d.ts.map +1 -0
- package/dist/flows/storage.js +118 -0
- package/dist/flows/storage.js.map +1 -0
- package/dist/flows/templates.d.ts +4 -0
- package/dist/flows/templates.d.ts.map +1 -0
- package/dist/flows/templates.js +35 -0
- package/dist/flows/templates.js.map +1 -0
- package/dist/flows/types.d.ts +3356 -0
- package/dist/flows/types.d.ts.map +1 -0
- package/dist/flows/types.js +133 -0
- package/dist/flows/types.js.map +1 -0
- package/dist/knowledge/store.d.ts +19 -0
- package/dist/knowledge/store.d.ts.map +1 -1
- package/dist/knowledge/store.js +63 -4
- package/dist/knowledge/store.js.map +1 -1
- package/dist/sessions/browser-connect.d.ts +30 -0
- package/dist/sessions/browser-connect.d.ts.map +1 -0
- package/dist/sessions/browser-connect.js +68 -0
- package/dist/sessions/browser-connect.js.map +1 -0
- package/dist/sessions/browser-state.d.ts +35 -0
- package/dist/sessions/browser-state.d.ts.map +1 -0
- package/dist/sessions/browser-state.js +74 -0
- package/dist/sessions/browser-state.js.map +1 -0
- package/dist/sessions/inject-cookies.d.ts +20 -0
- package/dist/sessions/inject-cookies.d.ts.map +1 -0
- package/dist/sessions/inject-cookies.js +57 -0
- package/dist/sessions/inject-cookies.js.map +1 -0
- package/dist/sessions/manager.d.ts +11 -1
- package/dist/sessions/manager.d.ts.map +1 -1
- package/dist/sessions/manager.js +40 -6
- package/dist/sessions/manager.js.map +1 -1
- package/dist/snapshot/store.d.ts +8 -0
- package/dist/snapshot/store.d.ts.map +1 -1
- package/dist/snapshot/store.js +48 -0
- package/dist/snapshot/store.js.map +1 -1
- package/dist/stealth/antibot-detector.d.ts +1 -1
- package/dist/stealth/antibot-detector.d.ts.map +1 -1
- package/dist/stealth/antibot-detector.js +56 -0
- package/dist/stealth/antibot-detector.js.map +1 -1
- package/dist/stealth/browser-image-extract.d.ts +43 -0
- package/dist/stealth/browser-image-extract.d.ts.map +1 -0
- package/dist/stealth/browser-image-extract.js +268 -0
- package/dist/stealth/browser-image-extract.js.map +1 -0
- package/dist/stealth/browser.d.ts +5 -0
- package/dist/stealth/browser.d.ts.map +1 -1
- package/dist/stealth/browser.js +82 -1
- package/dist/stealth/browser.js.map +1 -1
- package/dist/stealth/chrome-profile.d.ts +1 -0
- package/dist/stealth/chrome-profile.d.ts.map +1 -1
- package/dist/stealth/chrome-profile.js +28 -5
- package/dist/stealth/chrome-profile.js.map +1 -1
- package/dist/stealth/detector.d.ts +10 -1
- package/dist/stealth/detector.d.ts.map +1 -1
- package/dist/stealth/detector.js +117 -25
- package/dist/stealth/detector.js.map +1 -1
- package/dist/stealth/headers.d.ts +1 -1
- package/dist/stealth/headers.d.ts.map +1 -1
- package/dist/stealth/headers.js +94 -2
- package/dist/stealth/headers.js.map +1 -1
- package/dist/stealth/index.d.ts +4 -0
- package/dist/stealth/index.d.ts.map +1 -1
- package/dist/stealth/index.js +207 -25
- package/dist/stealth/index.js.map +1 -1
- package/dist/stealth/proxy.d.ts +40 -1
- package/dist/stealth/proxy.d.ts.map +1 -1
- package/dist/stealth/proxy.js +90 -6
- package/dist/stealth/proxy.js.map +1 -1
- package/dist/tools/action-executor.d.ts +2 -0
- package/dist/tools/action-executor.d.ts.map +1 -1
- package/dist/tools/action-executor.js +38 -0
- package/dist/tools/action-executor.js.map +1 -1
- package/dist/tools/batch-download.d.ts +33 -0
- package/dist/tools/batch-download.d.ts.map +1 -0
- package/dist/tools/batch-download.js +208 -0
- package/dist/tools/batch-download.js.map +1 -0
- package/dist/tools/browser.d.ts +100 -0
- package/dist/tools/browser.d.ts.map +1 -0
- package/dist/tools/browser.js +448 -0
- package/dist/tools/browser.js.map +1 -0
- package/dist/tools/download.d.ts +35 -2
- package/dist/tools/download.d.ts.map +1 -1
- package/dist/tools/download.js +245 -44
- package/dist/tools/download.js.map +1 -1
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +23 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/inspect-flow.d.ts +24 -0
- package/dist/tools/inspect-flow.d.ts.map +1 -0
- package/dist/tools/inspect-flow.js +23 -0
- package/dist/tools/inspect-flow.js.map +1 -0
- package/dist/tools/interact.d.ts +28 -15
- package/dist/tools/interact.d.ts.map +1 -1
- package/dist/tools/interact.js +48 -1
- package/dist/tools/interact.js.map +1 -1
- package/dist/tools/list-flows.d.ts +21 -0
- package/dist/tools/list-flows.d.ts.map +1 -0
- package/dist/tools/list-flows.js +18 -0
- package/dist/tools/list-flows.js.map +1 -0
- package/dist/tools/manifest.d.ts.map +1 -1
- package/dist/tools/manifest.js +43 -0
- package/dist/tools/manifest.js.map +1 -1
- package/dist/tools/monitor.d.ts +46 -0
- package/dist/tools/monitor.d.ts.map +1 -0
- package/dist/tools/monitor.js +213 -0
- package/dist/tools/monitor.js.map +1 -0
- package/dist/tools/pdf-extract.d.ts +38 -0
- package/dist/tools/pdf-extract.d.ts.map +1 -0
- package/dist/tools/pdf-extract.js +244 -0
- package/dist/tools/pdf-extract.js.map +1 -0
- package/dist/tools/record-flow.d.ts +39 -0
- package/dist/tools/record-flow.d.ts.map +1 -0
- package/dist/tools/record-flow.js +406 -0
- package/dist/tools/record-flow.js.map +1 -0
- package/dist/tools/run-flow.d.ts +54 -0
- package/dist/tools/run-flow.d.ts.map +1 -0
- package/dist/tools/run-flow.js +47 -0
- package/dist/tools/run-flow.js.map +1 -0
- package/dist/tools/run-skill.d.ts +2 -2
- package/dist/tools/run-skill.d.ts.map +1 -1
- package/dist/tools/run-skill.js +1 -0
- package/dist/tools/run-skill.js.map +1 -1
- package/dist/tools/scrape.d.ts.map +1 -1
- package/dist/tools/scrape.js +17 -1
- package/dist/tools/scrape.js.map +1 -1
- package/dist/tools/serve-flow.d.ts +36 -0
- package/dist/tools/serve-flow.d.ts.map +1 -0
- package/dist/tools/serve-flow.js +42 -0
- package/dist/tools/serve-flow.js.map +1 -0
- package/dist/tools/validate-flow.d.ts +24 -0
- package/dist/tools/validate-flow.d.ts.map +1 -0
- package/dist/tools/validate-flow.js +23 -0
- package/dist/tools/validate-flow.js.map +1 -0
- package/dist/tools/watch.d.ts +68 -0
- package/dist/tools/watch.d.ts.map +1 -0
- package/dist/tools/watch.js +224 -0
- package/dist/tools/watch.js.map +1 -0
- package/dist/utils/fetcher.d.ts +13 -4
- package/dist/utils/fetcher.d.ts.map +1 -1
- package/dist/utils/fetcher.js +121 -24
- package/dist/utils/fetcher.js.map +1 -1
- package/package.json +15 -4
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* watch — one-shot change detector for URLs.
|
|
3
|
+
*
|
|
4
|
+
* v2.5.0: one-shot mode only. Snapshots content, hashes it, diffs against
|
|
5
|
+
* the previous snapshot for the same URL. Fires a webhook on change.
|
|
6
|
+
*
|
|
7
|
+
* Daemon mode (SIGINT loop) is deferred to v2.6.0 — use cron externally:
|
|
8
|
+
* * /30 * * * * imperium-crawl watch --url X --output-dir /var/watch
|
|
9
|
+
*/
|
|
10
|
+
import { z } from "zod";
|
|
11
|
+
import { createHash } from "node:crypto";
|
|
12
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
13
|
+
import { existsSync } from "node:fs";
|
|
14
|
+
import { join, resolve as resolvePath } from "node:path";
|
|
15
|
+
import { JSDOM } from "jsdom";
|
|
16
|
+
import { Readability } from "@mozilla/readability";
|
|
17
|
+
import { fetchPage } from "../utils/fetcher.js";
|
|
18
|
+
import { htmlToMarkdown } from "../utils/markdown.js";
|
|
19
|
+
import { toolResult, errorResult } from "../utils/tool-response.js";
|
|
20
|
+
import { debugLog } from "../utils/debug.js";
|
|
21
|
+
import { MAX_URL_LENGTH } from "../constants.js";
|
|
22
|
+
// Tool identity as exported to the tool registry.
export const name = "watch";
export const description = "One-shot change detector: scrape a URL, hash its content, and compare against the last snapshot. Fires a webhook on change. Run via cron for periodic monitoring.";

/**
 * Input schema for the watch tool.
 *
 * - url:         page to fetch and fingerprint.
 * - output_dir:  where snapshots and `.state.json` are written.
 * - hash_on:     which representation of the page is hashed.
 * - webhook:     optional endpoint that receives a POST when a change is seen.
 * - diff_format: how the diff is rendered in the result.
 * - one_shot:    accepted for forward compatibility; the code in this file
 *                never reads it.
 */
export const schema = z.object({
  url: z.string().max(MAX_URL_LENGTH).describe("URL to watch"),
  output_dir: z.string().default("./data/watch").describe("Directory to persist snapshots and state"),
  hash_on: z.enum(["content", "readability", "markdown"]).default("readability").describe("What to hash: full HTML, readability main content, or markdown"),
  webhook: z.string().max(MAX_URL_LENGTH).optional().describe("If set, POST a JSON payload to this URL on detected change"),
  diff_format: z.enum(["unified", "json"]).default("unified").describe("Diff representation in the result"),
  one_shot: z.boolean().default(true).describe("v2.5.0: always true. Daemon mode lands in v2.6.0."),
});
|
|
51
|
+
/**
 * Derive a short, filesystem-safe identifier for a URL.
 *
 * @param {string} url - URL to fingerprint.
 * @returns {string} The first 16 hex characters of the URL's SHA-1 digest.
 */
function slugify(url) {
  const digest = createHash("sha1").update(url).digest("hex");
  return digest.slice(0, 16);
}
|
|
54
|
+
/**
 * Hash a string with SHA-256.
 *
 * @param {string} s - Text to hash.
 * @returns {string} Lowercase hex digest (64 characters).
 */
function hashString(s) {
  const hasher = createHash("sha256");
  hasher.update(s);
  return hasher.digest("hex");
}
|
|
57
|
+
/**
 * Produce a compact, unified-looking text diff of two strings.
 *
 * This is a set-membership comparison, not a positional diff: a line is
 * reported as removed when it appears nowhere in `next`, and as added when it
 * appears nowhere in `prev`. Reordered or duplicated lines therefore produce
 * no entries. All removals are listed before all additions.
 *
 * @param {string} prev - Previous text.
 * @param {string} next - Current text.
 * @param {number} [maxLines=200] - Maximum number of diff entries to print.
 * @returns {string} Header line, then up to `maxLines` entries, then an
 *   overflow note when entries were cut.
 */
function simpleUnifiedDiff(prev, next, maxLines = 200) {
  const before = prev.split("\n");
  const after = next.split("\n");
  const seenBefore = new Set(before);
  const seenAfter = new Set(after);

  const deletions = before.filter((text) => !seenAfter.has(text)).map((text) => `- ${text}`);
  const insertions = after.filter((text) => !seenBefore.has(text)).map((text) => `+ ${text}`);
  const entries = [...deletions, ...insertions];

  const pieces = [
    `@@ -${before.length} +${after.length} @@ (${deletions.length} removed, ${insertions.length} added)`,
    "\n",
    entries.slice(0, maxLines).join("\n"),
  ];
  const overflow = entries.length - maxLines;
  if (overflow > 0) {
    pieces.push(`\n... (${overflow} more lines)`);
  }
  return pieces.join("");
}
|
|
82
|
+
/**
 * JSON rendering of the same set-membership diff used for the unified format.
 *
 * @param {string} prev - Previous text.
 * @param {string} next - Current text.
 * @returns {string} JSON string with `removed`, `added`, `prev_lines` and
 *   `next_lines`, in that key order.
 */
function jsonDiff(prev, next) {
  const before = prev.split("\n");
  const after = next.split("\n");

  // Lines of `source` that occur nowhere in `other`.
  const missingFrom = (source, other) => {
    const lookup = new Set(other);
    return source.filter((text) => !lookup.has(text));
  };

  const report = {
    removed: missingFrom(before, after),
    added: missingFrom(after, before),
    prev_lines: before.length,
    next_lines: after.length,
  };
  return JSON.stringify(report);
}
|
|
91
|
+
/**
 * Build the text that will be hashed and snapshotted for a page.
 *
 * @param {string} html - Raw HTML of the fetched page.
 * @param {string} url - Page URL, handed to JSDOM as the document URL.
 * @param {"content"|"readability"|"markdown"} hashOn - Representation to use.
 * @returns {Promise<string>} `html` unchanged for "content"; the markdown
 *   conversion for "markdown"; otherwise the trimmed Readability text, falling
 *   back to markdown when extraction yields nothing or throws.
 */
export async function computeSignature(html, url, hashOn) {
  switch (hashOn) {
    case "content":
      return html;
    case "markdown":
      return htmlToMarkdown(html);
    default:
      break;
  }

  // Readability mode: any failure while parsing or extracting falls back to markdown.
  try {
    const { document } = new JSDOM(html, { url }).window;
    const text = new Readability(document).parse()?.textContent;
    return text ? text.trim() : htmlToMarkdown(html);
  } catch {
    return htmlToMarkdown(html);
  }
}
|
|
109
|
+
/**
 * Load the persisted watch state (a map of URL -> state record).
 *
 * Best-effort by design: a missing, unreadable or malformed state file is
 * treated as "no state yet" so a run can always proceed.
 *
 * @param {string} stateFile - Path to the JSON state file.
 * @returns {Promise<object>} The parsed state, or `{}` when the file is
 *   absent, cannot be read or parsed, or does not contain a JSON object.
 */
async function loadState(stateFile) {
  if (!existsSync(stateFile)) {
    return {};
  }
  try {
    const parsed = JSON.parse(await readFile(stateFile, "utf-8"));
    // Valid JSON is not necessarily an object: `null`, arrays and primitives
    // all parse fine, but the caller indexes the result by URL, which throws
    // on null and misbehaves on arrays. Treat those as empty state.
    const isRecord = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isRecord ? parsed : {};
  } catch {
    // Unreadable or corrupt file: start from a clean slate.
    return {};
  }
}
|
|
120
|
+
/**
 * Persist the watch state as pretty-printed JSON (2-space indent).
 *
 * @param {string} stateFile - Destination path; overwritten if it exists.
 * @param {object} state - State map to serialize.
 * @returns {Promise<void>}
 */
async function saveState(stateFile, state) {
  const serialized = JSON.stringify(state, null, 2);
  await writeFile(stateFile, serialized, "utf-8");
}
|
|
123
|
+
/**
 * POST a JSON payload to a webhook URL. Never throws.
 *
 * @param {string} webhook - Destination URL.
 * @param {unknown} payload - Value serialized with JSON.stringify as the body.
 * @param {number} [timeoutMs=10000] - Abort the request after this many
 *   milliseconds so a stalled receiver cannot hang the run.
 * @returns {Promise<{fired: boolean, status?: number}>} `fired: true` with the
 *   HTTP status whenever a response arrived (including non-2xx);
 *   `fired: false` when the request failed or timed out.
 */
async function fireWebhook(webhook, payload, timeoutMs = 10_000) {
  try {
    const res = await fetch(webhook, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(timeoutMs),
    });
    // Only the status is needed; discard the body so the connection is released.
    try {
      await res.body?.cancel();
    } catch {
      // Failing to discard the body must not turn a delivered webhook into a failure.
    }
    return { fired: true, status: res.status };
  } catch (err) {
    debugLog("watch", "webhook failed", err);
    return { fired: false };
  }
}
|
|
137
|
+
/**
 * Run one watch cycle for `input.url`.
 *
 * Steps, in order: ensure the output directory exists, load `.state.json`,
 * fetch the page, build and hash its signature, compare with the stored hash,
 * rotate and write snapshot files, persist the updated state, build a diff,
 * and fire the webhook when a change was detected.
 *
 * A change is only reported when the stored hash was produced with the same
 * `hash_on` mode as this run. Switching modes makes the hashes incomparable
 * (they cover different representations of the page), so such a run silently
 * re-baselines instead of reporting a spurious change. State entries without
 * a recorded `hash_on` are compared as before.
 *
 * @param {object} input - Parsed tool input (`url`, `output_dir`, `hash_on`,
 *   `diff_format`, optional `webhook`).
 * @returns {Promise<object>} Result record with `changed`, `first_run`, the
 *   previous and current hashes, `snapshot_file`, `diff` (null unless a change
 *   was detected and a previous snapshot could be read), webhook outcome, the
 *   new state entry and the check timestamp.
 */
export async function runWatchOnce(input) {
  const outDir = resolvePath(input.output_dir);
  await mkdir(outDir, { recursive: true });
  const stateFile = join(outDir, ".state.json");
  const state = await loadState(stateFile);

  const fetched = await fetchPage(input.url);
  const signature = await computeSignature(fetched.html, input.url, input.hash_on);
  const currentHash = hashString(signature);

  const slug = slugify(input.url);
  const snapshotFile = join(outDir, `${slug}.snapshot.txt`);
  const prevSnapshotFile = join(outDir, `${slug}.previous.txt`);

  const existing = state[input.url];
  const firstRun = !existing;
  // Hashes are only comparable when both were computed over the same representation.
  const sameBasis = !firstRun && (existing.hash_on ?? input.hash_on) === input.hash_on;
  const changed = sameBasis && existing.last_hash !== currentHash;

  let previousSig = null;
  if (existsSync(snapshotFile)) {
    try {
      previousSig = await readFile(snapshotFile, "utf-8");
    } catch {
      // An unreadable snapshot only costs us the diff, not the run.
      previousSig = null;
    }
  }

  // Rotate previous snapshot only when content changed
  if (changed && previousSig !== null) {
    await writeFile(prevSnapshotFile, previousSig, "utf-8");
  }
  await writeFile(snapshotFile, signature, "utf-8");

  const nowIso = new Date().toISOString();
  const newState = {
    url: input.url,
    last_hash: currentHash,
    last_checked: nowIso,
    last_changed: changed ? nowIso : existing?.last_changed ?? null,
    hash_on: input.hash_on,
    check_count: (existing?.check_count ?? 0) + 1,
    change_count: (existing?.change_count ?? 0) + (changed ? 1 : 0),
  };
  state[input.url] = newState;
  await saveState(stateFile, state);

  let diff = null;
  if (changed && previousSig !== null) {
    diff =
      input.diff_format === "unified"
        ? simpleUnifiedDiff(previousSig, signature)
        : jsonDiff(previousSig, signature);
  }

  let webhookFired = false;
  let webhookStatus;
  if (changed && input.webhook) {
    const payload = {
      event: "watch.change",
      url: input.url,
      previous_hash: existing?.last_hash ?? null,
      current_hash: currentHash,
      detected_at: nowIso,
      diff,
    };
    const res = await fireWebhook(input.webhook, payload);
    webhookFired = res.fired;
    webhookStatus = res.status;
  }

  return {
    url: input.url,
    changed,
    first_run: firstRun,
    previous_hash: existing?.last_hash ?? null,
    current_hash: currentHash,
    hash_on: input.hash_on,
    snapshot_file: snapshotFile,
    diff,
    webhook_fired: webhookFired,
    webhook_status: webhookStatus,
    state: newState,
    checked_at: nowIso,
  };
}
|
|
214
|
+
/**
 * Tool entry point: run one watch cycle and wrap the outcome in the standard
 * tool response. Never throws; failures are logged and returned as an error
 * result carrying the error message.
 *
 * @param {object} input - Parsed tool input, passed through to runWatchOnce.
 * @returns {Promise<object>} Whatever toolResult / errorResult produce.
 */
export async function execute(input) {
  try {
    return toolResult(await runWatchOnce(input));
  } catch (err) {
    debugLog("watch", "failed", err);
    let message;
    if (err instanceof Error) {
      message = err.message;
    } else {
      message = String(err);
    }
    return errorResult(message);
  }
}
|
|
224
|
+
//# sourceMappingURL=watch.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"watch.js","sourceRoot":"","sources":["../../src/tools/watch.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,IAAI,EAAE,OAAO,IAAI,WAAW,EAAE,MAAM,WAAW,CAAC;AACzD,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AACpE,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAC7C,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEjD,MAAM,CAAC,MAAM,IAAI,GAAG,OAAO,CAAC;AAE5B,MAAM,CAAC,MAAM,WAAW,GACtB,mKAAmK,CAAC;AAEtK,MAAM,CAAC,MAAM,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC;IAC7B,GAAG,EAAE,CAAC;SACH,MAAM,EAAE;SACR,GAAG,CAAC,cAAc,CAAC;SACnB,QAAQ,CAAC,cAAc,CAAC;IAC3B,UAAU,EAAE,CAAC;SACV,MAAM,EAAE;SACR,OAAO,CAAC,cAAc,CAAC;SACvB,QAAQ,CAAC,0CAA0C,CAAC;IACvD,OAAO,EAAE,CAAC;SACP,IAAI,CAAC,CAAC,SAAS,EAAE,aAAa,EAAE,UAAU,CAAC,CAAC;SAC5C,OAAO,CAAC,aAAa,CAAC;SACtB,QAAQ,CAAC,gEAAgE,CAAC;IAC7E,OAAO,EAAE,CAAC;SACP,MAAM,EAAE;SACR,GAAG,CAAC,cAAc,CAAC;SACnB,QAAQ,EAAE;SACV,QAAQ,CAAC,4DAA4D,CAAC;IACzE,WAAW,EAAE,CAAC;SACX,IAAI,CAAC,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;SACzB,OAAO,CAAC,SAAS,CAAC;SAClB,QAAQ,CAAC,mCAAmC,CAAC;IAChD,QAAQ,EAAE,CAAC;SACR,OAAO,EAAE;SACT,OAAO,CAAC,IAAI,CAAC;SACb,QAAQ,CAAC,mDAAmD,CAAC;CACjE,CAAC,CAAC;AA6BH,SAAS,OAAO,CAAC,GAAW;IAC1B,OAAO,UAAU,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AACnE,CAAC;AAED,SAAS,UAAU,CAAC,CAAS;IAC3B,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACtD,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,IAAY,EAAE,QAAQ,GAAG,GAAG;IACnE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAA
C;IAEnC,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,GAAG,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;YACtB,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IACD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,GAAG,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;YACtB,KAAK,EAAE,CAAC;QACV,CAAC;IACH,CAAC;IACD,MAAM,MAAM,GAAG,OAAO,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC,MAAM,QAAQ,OAAO,aAAa,KAAK,SAAS,CAAC;IACtG,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/C,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,GAAG,QAAQ,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,MAAM,GAAG,QAAQ,cAAc,CAAC,CAAC,CAAC,EAAE,CAAC;IAC7F,OAAO,GAAG,MAAM,KAAK,IAAI,GAAG,SAAS,EAAE,CAAC;AAC1C,CAAC;AAED,SAAS,QAAQ,CAAC,IAAY,EAAE,IAAY;IAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;IACnC,MAAM,OAAO,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACzD,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACvD,OAAO,IAAI,CAAC,SAAS,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,SAAS,CAAC,MAAM,EAAE,UAAU,EAAE,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC;AACxG,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,IAAY,EACZ,GAAW,EACX,MAA6B;IAE7B,IAAI,MAAM,KAAK,SAAS;QAAE,OAAO,IAAI,CAAC;IACtC,IAAI,MAAM,KAAK,UAAU;QAAE,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;IAEvD,cAAc;IACd,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QACpD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAC/B,IAAI,OAAO,EAAE,WAAW;YAAE,OAAO,OAAO,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;QAC5D,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;IAC9B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;IAC9B
,CAAC;AACH,CAAC;AAED,KAAK,UAAU,SAAS,CAAC,SAAiB;IACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;QAAE,OAAO,EAAE,CAAC;IACtC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QAC/C,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAA+B,CAAC;IACvD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,KAAK,UAAU,SAAS,CAAC,SAAiB,EAAE,KAAiC;IAC3E,MAAM,SAAS,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACtE,CAAC;AAED,KAAK,UAAU,WAAW,CACxB,OAAe,EACf,OAAgB;IAEhB,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,OAAO,EAAE;YAC/B,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;SAC9B,CAAC,CAAC;QACH,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,QAAQ,CAAC,OAAO,EAAE,gBAAgB,EAAE,GAAG,CAAC,CAAC;QACzC,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;IAC1B,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,KAAiB;IAClD,MAAM,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IAC7C,MAAM,KAAK,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,SAAS,CAAC,CAAC;IAEzC,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC3C,MAAM,SAAS,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IACjF,MAAM,WAAW,GAAG,UAAU,CAAC,SAAS,CAAC,CAAC;IAE1C,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,eAAe,CAAC,CAAC;IAC1D,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,eAAe,CAAC,CAAC;IAE9D,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAG,CAAC,QAAQ,CAAC;IAC3B,MAAM,OAAO,GAAG,CAAC,QAAQ,IAAI,QAAQ,CAAC,SAAS,KAAK,WAAW,CAAC;IAEhE,IAAI,WAAW,GAAkB,IAAI,CAAC;IACtC,IAAI,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,WAAW,GAAG,MAAM,QAAQ,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;QACtD,CAAC;QAAC,MAAM,CAAC;YACP,WAAW,GAAG,IAAI,CAAC;QACrB,CAAC;IACH,CAAC;IAED,qDAAqD;IACrD,IAAI,OAAO,IAAI,WAAW,KAAK,IAAI,EAAE,CAAC;QACp
C,MAAM,SAAS,CAAC,gBAAgB,EAAE,WAAW,EAAE,OAAO,CAAC,CAAC;IAC1D,CAAC;IACD,MAAM,SAAS,CAAC,YAAY,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAElD,MAAM,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACxC,MAAM,QAAQ,GAAe;QAC3B,GAAG,EAAE,KAAK,CAAC,GAAG;QACd,SAAS,EAAE,WAAW;QACtB,YAAY,EAAE,MAAM;QACpB,YAAY,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE,YAAY,IAAI,IAAI;QAC/D,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,WAAW,EAAE,CAAC,QAAQ,EAAE,WAAW,IAAI,CAAC,CAAC,GAAG,CAAC;QAC7C,YAAY,EAAE,CAAC,QAAQ,EAAE,YAAY,IAAI,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;KAChE,CAAC;IACF,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,QAAQ,CAAC;IAC5B,MAAM,SAAS,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;IAElC,IAAI,IAAI,GAAkB,IAAI,CAAC;IAC/B,IAAI,OAAO,IAAI,WAAW,KAAK,IAAI,EAAE,CAAC;QACpC,IAAI;YACF,KAAK,CAAC,WAAW,KAAK,SAAS;gBAC7B,CAAC,CAAC,iBAAiB,CAAC,WAAW,EAAE,SAAS,CAAC;gBAC3C,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;IACzC,CAAC;IAED,IAAI,YAAY,GAAG,KAAK,CAAC;IACzB,IAAI,aAAiC,CAAC;IACtC,IAAI,OAAO,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG;YACd,KAAK,EAAE,cAAc;YACrB,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,aAAa,EAAE,QAAQ,EAAE,SAAS,IAAI,IAAI;YAC1C,YAAY,EAAE,WAAW;YACzB,WAAW,EAAE,MAAM;YACnB,IAAI;SACL,CAAC;QACF,MAAM,GAAG,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;QACtD,YAAY,GAAG,GAAG,CAAC,KAAK,CAAC;QACzB,aAAa,GAAG,GAAG,CAAC,MAAM,CAAC;IAC7B,CAAC;IAED,OAAO;QACL,GAAG,EAAE,KAAK,CAAC,GAAG;QACd,OAAO;QACP,SAAS,EAAE,QAAQ;QACnB,aAAa,EAAE,QAAQ,EAAE,SAAS,IAAI,IAAI;QAC1C,YAAY,EAAE,WAAW;QACzB,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,aAAa,EAAE,YAAY;QAC3B,IAAI;QACJ,aAAa,EAAE,YAAY;QAC3B,cAAc,EAAE,aAAa;QAC7B,KAAK,EAAE,QAAQ;QACf,UAAU,EAAE,MAAM;KACnB,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAiB;IAC7C,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,KAAK,CAAC,CAAC;QACzC,OAAO,UAAU,CAAC,MAAM,CAAC,CAAC;IAC5B,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,QAAQ,CAAC,OAAO,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAC;QACjC,OAAO,WAAW,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IACvE,CAAC;AACH,CAAC"}
|
package/dist/utils/fetcher.d.ts
CHANGED
|
@@ -20,10 +20,19 @@ declare const CIRCUIT_OPEN_DURATION_MS = 60000;
|
|
|
20
20
|
declare const CIRCUIT_PROBE_SUCCESSES = 3;
|
|
21
21
|
declare const circuits: Map<string, CircuitBreaker>;
|
|
22
22
|
declare const CIRCUIT_STALE_MS = 3600000;
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
23
|
+
/**
|
|
24
|
+
* Get circuit breaker key for a URL.
|
|
25
|
+
* Uses domain + first 2 path segments for endpoint-level granularity.
|
|
26
|
+
*/
|
|
27
|
+
declare function getCircuitKey(url: string): string;
|
|
28
|
+
declare function getCircuit(key: string): CircuitBreaker;
|
|
29
|
+
declare function recordSuccess(key: string): void;
|
|
30
|
+
declare function recordFailure(key: string): void;
|
|
31
|
+
/**
|
|
32
|
+
* Check domain-level circuit: opens when 3+ endpoint circuits are open for this domain.
|
|
33
|
+
*/
|
|
34
|
+
declare function isDomainCircuitOpen(domain: string): boolean;
|
|
35
|
+
export { circuits, getCircuit, getCircuitKey, recordSuccess, recordFailure, isDomainCircuitOpen, CIRCUIT_FAILURE_THRESHOLD, CIRCUIT_OPEN_DURATION_MS, CIRCUIT_PROBE_SUCCESSES, CIRCUIT_STALE_MS };
|
|
27
36
|
export interface SmartFetchOptions extends StealthOptions {
|
|
28
37
|
respectRobots?: boolean;
|
|
29
38
|
retries?: number;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/utils/fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAA4B,KAAK,WAAW,
|
|
1
|
+
{"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/utils/fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAA4B,KAAK,WAAW,EAAqB,KAAK,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAQzH,qBAAa,kBAAkB;IAIjB,OAAO,CAAC,aAAa;IAHjC,OAAO,CAAC,OAAO,CAAK;IACpB,OAAO,CAAC,KAAK,CAAyB;gBAElB,aAAa,GAAE,MAA4B;IAEzD,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;CAa/C;AAED,eAAO,MAAM,cAAc,oBAA2B,CAAC;AAIvD,KAAK,YAAY,GAAG,QAAQ,GAAG,MAAM,GAAG,WAAW,CAAC;AAEpD,UAAU,cAAc;IACtB,KAAK,EAAE,YAAY,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,QAAA,MAAM,yBAAyB,IAAI,CAAC;AACpC,QAAA,MAAM,wBAAwB,QAAS,CAAC;AACxC,QAAA,MAAM,uBAAuB,IAAI,CAAC;AAIlC,QAAA,MAAM,QAAQ,6BAAoC,CAAC;AAGnD,QAAA,MAAM,gBAAgB,UAAY,CAAC;AAUnC;;;GAGG;AACH,iBAAS,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAS1C;AAED,iBAAS,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,cAAc,CAgB/C;AAED,iBAAS,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI,CAYxC;AAED,iBAAS,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI,CAcxC;AAED;;GAEG;AACH,iBAAS,mBAAmB,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAQpD;AAGD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,aAAa,EAAE,aAAa,EAAE,aAAa,EAAE,mBAAmB,EAAE,yBAAyB,EAAE,wBAAwB,EAAE,uBAAuB,EAAE,gBAAgB,EAAE,CAAC;AAsDlM,MAAM,WAAW,iBAAkB,SAAQ,cAAc;IACvD,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AA+BD,wBAAsB,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,WAAW,CAAC,CAiG9F"}
|
package/dist/utils/fetcher.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { smartFetch, StealthError } from "../stealth/index.js";
|
|
2
2
|
import { isAllowed } from "./robots.js";
|
|
3
3
|
import { getDomain } from "./url.js";
|
|
4
|
-
import { DEFAULT_CONCURRENCY } from "../constants.js";
|
|
4
|
+
import { DEFAULT_CONCURRENCY, DEFAULT_TIMEOUT_MS, MAX_TIMEOUT_MS } from "../constants.js";
|
|
5
5
|
import { getKnowledgeEngine } from "../knowledge/index.js";
|
|
6
6
|
// ── Concurrency Limiter ──
|
|
7
7
|
export class ConcurrencyLimiter {
|
|
@@ -31,22 +31,39 @@ export const defaultLimiter = new ConcurrencyLimiter();
|
|
|
31
31
|
const CIRCUIT_FAILURE_THRESHOLD = 5;
|
|
32
32
|
const CIRCUIT_OPEN_DURATION_MS = 60_000;
|
|
33
33
|
const CIRCUIT_PROBE_SUCCESSES = 3;
|
|
34
|
+
// Domain-level circuit: higher threshold — only opens when multiple endpoints fail
|
|
35
|
+
const DOMAIN_CIRCUIT_FAILURE_THRESHOLD = 10;
|
|
34
36
|
const circuits = new Map();
|
|
35
37
|
// Periodic cleanup: remove closed circuits idle for >1 hour
|
|
36
38
|
const CIRCUIT_STALE_MS = 3_600_000;
|
|
37
39
|
setInterval(() => {
|
|
38
40
|
const now = Date.now();
|
|
39
|
-
for (const [
|
|
41
|
+
for (const [key, circuit] of circuits) {
|
|
40
42
|
if (now - circuit.lastAccessed > CIRCUIT_STALE_MS) {
|
|
41
|
-
circuits.delete(
|
|
43
|
+
circuits.delete(key);
|
|
42
44
|
}
|
|
43
45
|
}
|
|
44
46
|
}, 300_000).unref();
|
|
45
|
-
|
|
46
|
-
|
|
47
|
+
/**
 * Build the circuit-breaker key for a URL.
 *
 * The key is the URL's hostname followed by at most the first two non-empty
 * path segments, so failures are tracked per endpoint rather than per host.
 * A URL with no path segments yields just the hostname. Input that cannot be
 * parsed as a URL is delegated to getDomain().
 *
 * @param {string} url
 * @returns {string}
 */
function getCircuitKey(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return getDomain(url);
  }
  const leadingSegments = parsed.pathname
    .split("/")
    .filter(Boolean)
    .slice(0, 2)
    .join("/");
  if (leadingSegments === "") {
    return parsed.hostname;
  }
  return `${parsed.hostname}/${leadingSegments}`;
}
|
|
62
|
+
function getCircuit(key) {
|
|
63
|
+
let circuit = circuits.get(key);
|
|
47
64
|
if (!circuit) {
|
|
48
65
|
circuit = { state: "closed", failures: 0, openedAt: 0, probeSuccesses: 0, lastAccessed: Date.now() };
|
|
49
|
-
circuits.set(
|
|
66
|
+
circuits.set(key, circuit);
|
|
50
67
|
}
|
|
51
68
|
circuit.lastAccessed = Date.now();
|
|
52
69
|
// Check if open circuit should transition to half-open
|
|
@@ -56,8 +73,8 @@ function getCircuit(domain) {
|
|
|
56
73
|
}
|
|
57
74
|
return circuit;
|
|
58
75
|
}
|
|
59
|
-
function recordSuccess(
|
|
60
|
-
const circuit = getCircuit(
|
|
76
|
+
function recordSuccess(key) {
|
|
77
|
+
const circuit = getCircuit(key);
|
|
61
78
|
if (circuit.state === "half-open") {
|
|
62
79
|
circuit.probeSuccesses++;
|
|
63
80
|
if (circuit.probeSuccesses >= CIRCUIT_PROBE_SUCCESSES) {
|
|
@@ -70,8 +87,8 @@ function recordSuccess(domain) {
|
|
|
70
87
|
circuit.failures = 0;
|
|
71
88
|
}
|
|
72
89
|
}
|
|
73
|
-
function recordFailure(
|
|
74
|
-
const circuit = getCircuit(
|
|
90
|
+
function recordFailure(key) {
|
|
91
|
+
const circuit = getCircuit(key);
|
|
75
92
|
// Half-open probe failed → immediately reopen circuit
|
|
76
93
|
if (circuit.state === "half-open") {
|
|
77
94
|
circuit.state = "open";
|
|
@@ -85,14 +102,32 @@ function recordFailure(domain) {
|
|
|
85
102
|
circuit.openedAt = Date.now();
|
|
86
103
|
}
|
|
87
104
|
}
|
|
105
|
+
/**
 * Check the domain-level circuit: it counts as open when 3 or more circuits
 * belonging to this domain are currently in the "open" state.
 *
 * A circuit belongs to the domain when its key is exactly the domain or
 * starts with the domain followed by "/". A bare prefix test is not enough:
 * it would also count unrelated hosts that merely begin with the same
 * characters (e.g. "example.com.evil.test/..." or "example.community/..."),
 * and an empty domain would match every key.
 *
 * @param {string} domain - Hostname as it appears at the start of circuit keys.
 * @returns {boolean}
 */
function isDomainCircuitOpen(domain) {
  const endpointPrefix = `${domain}/`;
  let openEndpoints = 0;
  for (const [key, circuit] of circuits) {
    const belongsToDomain = key === domain || key.startsWith(endpointPrefix);
    if (belongsToDomain && circuit.state === "open") {
      openEndpoints++;
    }
  }
  return openEndpoints >= 3;
}
|
|
88
117
|
// Exported for testing
|
|
89
|
-
export { circuits, getCircuit, recordSuccess, recordFailure, CIRCUIT_FAILURE_THRESHOLD, CIRCUIT_OPEN_DURATION_MS, CIRCUIT_PROBE_SUCCESSES, CIRCUIT_STALE_MS };
|
|
118
|
+
export { circuits, getCircuit, getCircuitKey, recordSuccess, recordFailure, isDomainCircuitOpen, CIRCUIT_FAILURE_THRESHOLD, CIRCUIT_OPEN_DURATION_MS, CIRCUIT_PROBE_SUCCESSES, CIRCUIT_STALE_MS };
|
|
90
119
|
// ── Exponential Backoff with Full Jitter (AWS pattern) ──
|
|
91
120
|
const BACKOFF_BASE_MS = 1000;
|
|
92
121
|
const BACKOFF_CAP_MS = 30_000;
|
|
93
|
-
|
|
122
|
+
const RATE_LIMIT_EXTRA_JITTER_MS = 10_000; // Extra jitter for 429 responses
|
|
123
|
+
function fullJitterBackoff(attempt, is429 = false) {
|
|
94
124
|
const expDelay = Math.min(BACKOFF_CAP_MS, BACKOFF_BASE_MS * Math.pow(2, attempt));
|
|
95
|
-
|
|
125
|
+
const baseJitter = Math.random() * expDelay;
|
|
126
|
+
// On 429, add extra random jitter to avoid thundering herd
|
|
127
|
+
if (is429) {
|
|
128
|
+
return baseJitter + 5000 + Math.random() * RATE_LIMIT_EXTRA_JITTER_MS;
|
|
129
|
+
}
|
|
130
|
+
return baseJitter;
|
|
96
131
|
}
|
|
97
132
|
// ── Per-Domain Rate Limiter ──
|
|
98
133
|
const DEFAULT_DOMAIN_RATE_MS = parseInt(process.env.DOMAIN_RATE_LIMIT_MS || "500", 10);
|
|
@@ -120,6 +155,31 @@ class DomainThrottle {
|
|
|
120
155
|
}
|
|
121
156
|
}
|
|
122
157
|
const domainThrottle = new DomainThrottle();
|
|
158
|
+
/**
|
|
159
|
+
* Compute adaptive timeout based on knowledge engine data.
|
|
160
|
+
* Uses avg_response_time * 3 with a floor of DEFAULT_TIMEOUT_MS and ceiling of MAX_TIMEOUT_MS.
|
|
161
|
+
*/
|
|
162
|
+
function computeAdaptiveTimeout(avgResponseTimeMs) {
|
|
163
|
+
if (!avgResponseTimeMs || avgResponseTimeMs <= 0)
|
|
164
|
+
return DEFAULT_TIMEOUT_MS;
|
|
165
|
+
return Math.min(MAX_TIMEOUT_MS, Math.max(DEFAULT_TIMEOUT_MS, avgResponseTimeMs * 3));
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Determine the escalated stealth level for a retry attempt.
|
|
169
|
+
* attempt 0: user's level, attempt 1: level+1, attempt 2: L3
|
|
170
|
+
*/
|
|
171
|
+
function getEscalatedLevel(baseLevel, attempt, lastError) {
|
|
172
|
+
// If last failure was a StealthError with detected anti-bot, jump to L3
|
|
173
|
+
if (lastError instanceof StealthError && lastError.antiBotSystem) {
|
|
174
|
+
return 3;
|
|
175
|
+
}
|
|
176
|
+
const base = baseLevel || 1;
|
|
177
|
+
if (attempt === 0)
|
|
178
|
+
return base;
|
|
179
|
+
if (attempt === 1)
|
|
180
|
+
return Math.min(base + 1, 3);
|
|
181
|
+
return 3; // attempt >= 2 → always L3
|
|
182
|
+
}
|
|
123
183
|
export async function fetchPage(url, options) {
|
|
124
184
|
const respectRobots = options?.respectRobots ?? (process.env.RESPECT_ROBOTS !== "false");
|
|
125
185
|
if (respectRobots) {
|
|
@@ -128,40 +188,77 @@ export async function fetchPage(url, options) {
|
|
|
128
188
|
throw new Error(`URL blocked by robots.txt: ${url}`);
|
|
129
189
|
}
|
|
130
190
|
}
|
|
131
|
-
//
|
|
191
|
+
// Per-endpoint circuit breaker check
|
|
132
192
|
const domain = getDomain(url);
|
|
133
|
-
const
|
|
193
|
+
const circuitKey = getCircuitKey(url);
|
|
194
|
+
const circuit = getCircuit(circuitKey);
|
|
134
195
|
if (circuit.state === "open") {
|
|
135
|
-
throw new Error(`Circuit breaker open for ${
|
|
196
|
+
throw new Error(`Circuit breaker open for endpoint ${circuitKey} — too many consecutive failures. Retry after cooldown.`);
|
|
197
|
+
}
|
|
198
|
+
// Domain-level circuit check (opens when 3+ endpoints are broken)
|
|
199
|
+
if (isDomainCircuitOpen(domain)) {
|
|
200
|
+
throw new Error(`Circuit breaker open for domain ${domain} — multiple endpoints failing. Retry after cooldown.`);
|
|
136
201
|
}
|
|
137
202
|
// Per-domain rate limiting — use knowledge engine's safe_rate_limit if available
|
|
138
|
-
const
|
|
203
|
+
const engine = getKnowledgeEngine();
|
|
204
|
+
const knowledge = await engine.get(domain);
|
|
139
205
|
const knowledgeDelayMs = knowledge?.safe_rate_limit
|
|
140
206
|
? Math.round(60_000 / knowledge.safe_rate_limit)
|
|
141
207
|
: undefined;
|
|
142
208
|
await domainThrottle.throttle(domain, knowledgeDelayMs);
|
|
209
|
+
// ── Adaptive timeout from knowledge engine ──
|
|
210
|
+
const adaptiveTimeout = computeAdaptiveTimeout(knowledge?.avg_response_time_ms);
|
|
211
|
+
const timeout = options?.timeout || adaptiveTimeout;
|
|
143
212
|
const retries = options?.retries ?? 2;
|
|
144
213
|
let lastError;
|
|
214
|
+
let lastHttpStatus = 0;
|
|
145
215
|
for (let attempt = 0; attempt <= retries; attempt++) {
|
|
146
216
|
try {
|
|
147
|
-
|
|
148
|
-
|
|
217
|
+
// ── Smart retry: escalate stealth level on each attempt ──
|
|
218
|
+
const escalatedLevel = getEscalatedLevel(options?.forceLevel, attempt, lastError);
|
|
219
|
+
const attemptOptions = {
|
|
220
|
+
...options,
|
|
221
|
+
timeout,
|
|
222
|
+
// On retry, escalate stealth level (unless user forced a specific level)
|
|
223
|
+
forceLevel: attempt > 0 && !options?.forceLevel ? escalatedLevel : options?.forceLevel,
|
|
224
|
+
};
|
|
225
|
+
const result = await smartFetch(url, attemptOptions);
|
|
226
|
+
recordSuccess(circuitKey);
|
|
227
|
+
// Feed successful strategy back to knowledge engine
|
|
228
|
+
if (attempt > 0 && result.level > 1) {
|
|
229
|
+
engine.record({
|
|
230
|
+
url, domain,
|
|
231
|
+
levelUsed: result.level,
|
|
232
|
+
success: true,
|
|
233
|
+
responseTimeMs: 0, // Already recorded by smartFetch
|
|
234
|
+
antiBotSystem: result.antiBotSystem || null,
|
|
235
|
+
captchaType: result.captchaSolved ? "detected" : null,
|
|
236
|
+
proxyUsed: !!result.proxyUsed,
|
|
237
|
+
blocked: false,
|
|
238
|
+
httpStatus: result.status,
|
|
239
|
+
});
|
|
240
|
+
}
|
|
149
241
|
return result;
|
|
150
242
|
}
|
|
151
243
|
catch (err) {
|
|
152
244
|
lastError = err instanceof Error ? err : new Error(String(err));
|
|
153
|
-
recordFailure(
|
|
154
|
-
//
|
|
155
|
-
|
|
245
|
+
recordFailure(circuitKey);
|
|
246
|
+
// Track HTTP status for backoff decisions
|
|
247
|
+
if (err instanceof StealthError) {
|
|
248
|
+
lastHttpStatus = err.httpStatus;
|
|
249
|
+
}
|
|
250
|
+
// Check if endpoint circuit just opened
|
|
251
|
+
const updatedCircuit = getCircuit(circuitKey);
|
|
156
252
|
if (updatedCircuit.state === "open") {
|
|
157
253
|
// Enrich error message with StealthError info if available
|
|
158
254
|
const detail = err instanceof StealthError
|
|
159
255
|
? `L${err.lastLevel} HTTP ${err.httpStatus}${err.antiBotSystem ? ` [${err.antiBotSystem}]` : ""}`
|
|
160
256
|
: "";
|
|
161
|
-
throw new Error(`Circuit breaker opened for ${
|
|
257
|
+
throw new Error(`Circuit breaker opened for ${circuitKey}${detail ? ` (${detail})` : ""}: ${lastError.message}`);
|
|
162
258
|
}
|
|
163
259
|
if (attempt < retries) {
|
|
164
|
-
const
|
|
260
|
+
const is429 = lastHttpStatus === 429;
|
|
261
|
+
const delay = fullJitterBackoff(attempt, is429);
|
|
165
262
|
await new Promise((r) => setTimeout(r, delay));
|
|
166
263
|
}
|
|
167
264
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../src/utils/fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,YAAY,
|
|
1
|
+
{"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../src/utils/fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,YAAY,EAA4D,MAAM,qBAAqB,CAAC;AACzH,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAC1F,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAE3D,4BAA4B;AAE5B,MAAM,OAAO,kBAAkB;IAIT;IAHZ,OAAO,GAAG,CAAC,CAAC;IACZ,KAAK,GAAsB,EAAE,CAAC;IAEtC,YAAoB,gBAAwB,mBAAmB;QAA3C,kBAAa,GAAb,aAAa,CAA8B;IAAG,CAAC;IAEnE,KAAK,CAAC,GAAG,CAAI,EAAoB;QAC/B,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YAC1C,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC;QACjE,CAAC;QACD,IAAI,CAAC,OAAO,EAAE,CAAC;QACf,IAAI,CAAC;YACH,OAAO,MAAM,EAAE,EAAE,CAAC;QACpB,CAAC;gBAAS,CAAC;YACT,IAAI,CAAC,OAAO,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;YAChC,IAAI,IAAI;gBAAE,IAAI,EAAE,CAAC;QACnB,CAAC;IACH,CAAC;CACF;AAED,MAAM,CAAC,MAAM,cAAc,GAAG,IAAI,kBAAkB,EAAE,CAAC;AAcvD,MAAM,yBAAyB,GAAG,CAAC,CAAC;AACpC,MAAM,wBAAwB,GAAG,MAAM,CAAC;AACxC,MAAM,uBAAuB,GAAG,CAAC,CAAC;AAClC,mFAAmF;AACnF,MAAM,gCAAgC,GAAG,EAAE,CAAC;AAE5C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAA0B,CAAC;AAEnD,4DAA4D;AAC5D,MAAM,gBAAgB,GAAG,SAAS,CAAC;AACnC,WAAW,CAAC,GAAG,EAAE;IACf,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACvB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,QAAQ,EAAE,CAAC;QACtC,IAAI,GAAG,GAAG,OAAO,CAAC,YAAY,GAAG,gBAAgB,EAAE,CAAC;YAClD,QAAQ,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;AACH,CAAC,EAAE,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC;AAEpB;;;GAGG;AACH,SAAS,aAAa,CAAC,GAAW;IAChC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,SAAS,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC7D,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACnD,OAAO,UAAU,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,QAAQ,IAAI,UAAU,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC;IAC3E,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC;IACxB,CAA
C;AACH,CAAC;AAED,SAAS,UAAU,CAAC,GAAW;IAC7B,IAAI,OAAO,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,YAAY,EAAE,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC;QACrG,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IAC7B,CAAC;IAED,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAElC,uDAAuD;IACvD,IAAI,OAAO,CAAC,KAAK,KAAK,MAAM,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO,CAAC,QAAQ,IAAI,wBAAwB,EAAE,CAAC;QAC1F,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC;QAC5B,OAAO,CAAC,cAAc,GAAG,CAAC,CAAC;IAC7B,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,aAAa,CAAC,GAAW;IAChC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,OAAO,CAAC,KAAK,KAAK,WAAW,EAAE,CAAC;QAClC,OAAO,CAAC,cAAc,EAAE,CAAC;QACzB,IAAI,OAAO,CAAC,cAAc,IAAI,uBAAuB,EAAE,CAAC;YACtD,kBAAkB;YAClB,OAAO,CAAC,KAAK,GAAG,QAAQ,CAAC;YACzB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC;IACvB,CAAC;AACH,CAAC;AAED,SAAS,aAAa,CAAC,GAAW;IAChC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;IAChC,sDAAsD;IACtD,IAAI,OAAO,CAAC,KAAK,KAAK,WAAW,EAAE,CAAC;QAClC,OAAO,CAAC,KAAK,GAAG,MAAM,CAAC;QACvB,OAAO,CAAC,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC9B,OAAO,CAAC,QAAQ,GAAG,yBAAyB,CAAC;QAC7C,OAAO;IACT,CAAC;IACD,OAAO,CAAC,QAAQ,EAAE,CAAC;IACnB,IAAI,OAAO,CAAC,QAAQ,IAAI,yBAAyB,EAAE,CAAC;QAClD,OAAO,CAAC,KAAK,GAAG,MAAM,CAAC;QACvB,OAAO,CAAC,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAChC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,MAAc;IACzC,IAAI,aAAa,GAAG,CAAC,CAAC;IACtB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,QAAQ,EAAE,CAAC;QACtC,IAAI,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,KAAK,MAAM,EAAE,CAAC;YACvD,aAAa,EAAE,CAAC;QAClB,CAAC;IACH,CAAC;IACD,OAAO,aAAa,IAAI,CAAC,CAAC;AAC5B,CAAC;AAED,uBAAuB;AACvB,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,aAAa,EAAE,aAAa,EAAE,aAAa,EAAE,mBAAmB,EAAE,yBAAyB,EAAE,wBAAwB,EAAE,uBAAuB,EAAE,gBAAgB,EAAE,CAAC;AAElM,2DAA2D;AAE3D,MAAM,eAAe,GAAG,IAAI,CAAC;AAC7B,MAAM,cAAc,GAAG,MAAM,CAAC;AAC9B,MAAM,0BAA0B,GAAG,MAAM,CAAC,CAAC,iCAAiC;AAE5E,S
AAS,iBAAiB,CAAC,OAAe,EAAE,KAAK,GAAG,KAAK;IACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,cAAc,EAAE,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;IAClF,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,QAAQ,CAAC;IAC5C,2DAA2D;IAC3D,IAAI,KAAK,EAAE,CAAC;QACV,OAAO,UAAU,GAAG,IAAI,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,0BAA0B,CAAC;IACxE,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED,gCAAgC;AAEhC,MAAM,sBAAsB,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,KAAK,EAAE,EAAE,CAAC,CAAC;AAEvF,MAAM,cAAc;IACV,WAAW,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,YAAY,CAAS;IAE7B,YAAY,iBAAyB,sBAAsB;QACzD,IAAI,CAAC,YAAY,GAAG,cAAc,CAAC;IACrC,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CAAC,MAAc,EAAE,gBAAyB;QACtD,MAAM,KAAK,GAAG,gBAAgB,IAAI,IAAI,CAAC,YAAY,CAAC;QACpD,IAAI,KAAK,IAAI,CAAC;YAAE,OAAO;QAEvB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAG,GAAG,GAAG,IAAI,CAAC;QAE3B,IAAI,OAAO,GAAG,KAAK,EAAE,CAAC;YACpB,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,KAAK,GAAG,OAAO,CAAC,CAAC,CAAC;QAC3D,CAAC;QAED,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;IAC3C,CAAC;CACF;AAED,MAAM,cAAc,GAAG,IAAI,cAAc,EAAE,CAAC;AAS5C;;;GAGG;AACH,SAAS,sBAAsB,CAAC,iBAAqC;IACnE,IAAI,CAAC,iBAAiB,IAAI,iBAAiB,IAAI,CAAC;QAAE,OAAO,kBAAkB,CAAC;IAC5E,OAAO,IAAI,CAAC,GAAG,CAAC,cAAc,EAAE,IAAI,CAAC,GAAG,CAAC,kBAAkB,EAAE,iBAAiB,GAAG,CAAC,CAAC,CAAC,CAAC;AACvF,CAAC;AAED;;;GAGG;AACH,SAAS,iBAAiB,CACxB,SAAmC,EACnC,OAAe,EACf,SAA4B;IAE5B,wEAAwE;IACxE,IAAI,SAAS,YAAY,YAAY,IAAI,SAAS,CAAC,aAAa,EAAE,CAAC;QACjE,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,IAAI,GAAG,SAAS,IAAI,CAAC,CAAC;IAC5B,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAC/B,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC,CAAiB,CAAC;IAChE,OAAO,CAAC,CAAC,CAAC,2BAA2B;AACvC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,GAAW,EAAE,OAA2B;IACtE,MAAM,aAAa,GAAG,OAAO,EAAE,aAAa,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,KAAK,OAAO,CAAC,CAAC;IAEzF,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,GAA
G,CAAC,CAAC;QACrC,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,8BAA8B,GAAG,EAAE,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IAED,qCAAqC;IACrC,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAC9B,MAAM,UAAU,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;IACtC,MAAM,OAAO,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;IACvC,IAAI,OAAO,CAAC,KAAK,KAAK,MAAM,EAAE,CAAC;QAC7B,MAAM,IAAI,KAAK,CAAC,qCAAqC,UAAU,yDAAyD,CAAC,CAAC;IAC5H,CAAC;IAED,kEAAkE;IAClE,IAAI,mBAAmB,CAAC,MAAM,CAAC,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,mCAAmC,MAAM,sDAAsD,CAAC,CAAC;IACnH,CAAC;IAED,iFAAiF;IACjF,MAAM,MAAM,GAAG,kBAAkB,EAAE,CAAC;IACpC,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IAC3C,MAAM,gBAAgB,GAAG,SAAS,EAAE,eAAe;QACjD,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,SAAS,CAAC,eAAe,CAAC;QAChD,CAAC,CAAC,SAAS,CAAC;IACd,MAAM,cAAc,CAAC,QAAQ,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAAC;IAExD,+CAA+C;IAC/C,MAAM,eAAe,GAAG,sBAAsB,CAAC,SAAS,EAAE,oBAAoB,CAAC,CAAC;IAChF,MAAM,OAAO,GAAG,OAAO,EAAE,OAAO,IAAI,eAAe,CAAC;IAEpD,MAAM,OAAO,GAAG,OAAO,EAAE,OAAO,IAAI,CAAC,CAAC;IACtC,IAAI,SAA4B,CAAC;IACjC,IAAI,cAAc,GAAG,CAAC,CAAC;IAEvB,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,OAAO,EAAE,OAAO,EAAE,EAAE,CAAC;QACpD,IAAI,CAAC;YACH,4DAA4D;YAC5D,MAAM,cAAc,GAAG,iBAAiB,CAAC,OAAO,EAAE,UAAU,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;YAClF,MAAM,cAAc,GAAmB;gBACrC,GAAG,OAAO;gBACV,OAAO;gBACP,yEAAyE;gBACzE,UAAU,EAAE,OAAO,GAAG,CAAC,IAAI,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,OAAO,EAAE,UAAU;aACvF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,GAAG,EAAE,cAAc,CAAC,CAAC;YACrD,aAAa,CAAC,UAAU,CAAC,CAAC;YAE1B,oDAAoD;YACpD,IAAI,OAAO,GAAG,CAAC,IAAI,MAAM,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC;gBACpC,MAAM,CAAC,MAAM,CAAC;oBACZ,GAAG,EAAE,MAAM;oBACX,SAAS,EAAE,MAAM,CAAC,KAAK;oBACvB,OAAO,EAAE,IAAI;oBACb,cAAc,EAAE,CAAC,EAAE,iCAAiC;oBACpD,aAAa,EAAE,MAAM,CAAC,aAAa,IAAI,IAAI;oBAC3C,WAAW,EAAE,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI;oBACrD,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,SAAS;oBAC7B,OAAO,EAAE,KAAK;oBACd,UAAU,EAAE,MAAM,CAAC,MAAM;iBAC1B,CAAC,CAAC;YACL,CAAC;YAED,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,SAAS,GAAG,G
AAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAChE,aAAa,CAAC,UAAU,CAAC,CAAC;YAE1B,0CAA0C;YAC1C,IAAI,GAAG,YAAY,YAAY,EAAE,CAAC;gBAChC,cAAc,GAAG,GAAG,CAAC,UAAU,CAAC;YAClC,CAAC;YAED,wCAAwC;YACxC,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;YAC9C,IAAI,cAAc,CAAC,KAAK,KAAK,MAAM,EAAE,CAAC;gBACpC,2DAA2D;gBAC3D,MAAM,MAAM,GAAG,GAAG,YAAY,YAAY;oBACxC,CAAC,CAAC,IAAI,GAAG,CAAC,SAAS,SAAS,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC,aAAa,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,aAAa,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;oBACjG,CAAC,CAAC,EAAE,CAAC;gBACP,MAAM,IAAI,KAAK,CAAC,8BAA8B,UAAU,GAAG,MAAM,CAAC,CAAC,CAAC,KAAK,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,OAAO,EAAE,CAAC,CAAC;YACnH,CAAC;YAED,IAAI,OAAO,GAAG,OAAO,EAAE,CAAC;gBACtB,MAAM,KAAK,GAAG,cAAc,KAAK,GAAG,CAAC;gBACrC,MAAM,KAAK,GAAG,iBAAiB,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;gBAChD,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,SAAU,CAAC;AACnB,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "imperium-crawl",
|
|
3
|
-
"version": "2.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "2.5.1",
|
|
4
|
+
"description": "39-tool open-source CLI for web scraping, PDF extraction, content monitoring, reusable browser flows, RSS aggregation, and custom skills. Zero API keys for core tools.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
7
|
"imperium-crawl": "dist/index.js",
|
|
@@ -26,7 +26,9 @@
|
|
|
26
26
|
"start": "node dist/index.js",
|
|
27
27
|
"test": "vitest run",
|
|
28
28
|
"test:watch": "vitest",
|
|
29
|
-
"prepublishOnly": "npm run build"
|
|
29
|
+
"prepublishOnly": "npm run build",
|
|
30
|
+
"autoresearch": "tsx autoresearch/eval.ts",
|
|
31
|
+
"autoresearch:baseline": "tsx autoresearch/eval.ts --baseline --verbose"
|
|
30
32
|
},
|
|
31
33
|
"keywords": [
|
|
32
34
|
"scraping",
|
|
@@ -34,7 +36,15 @@
|
|
|
34
36
|
"web-search",
|
|
35
37
|
"brave-search",
|
|
36
38
|
"firecrawl",
|
|
37
|
-
"cli"
|
|
39
|
+
"cli",
|
|
40
|
+
"pdf-extract",
|
|
41
|
+
"web-monitoring",
|
|
42
|
+
"url-watch",
|
|
43
|
+
"content-diff",
|
|
44
|
+
"intelligence-digest",
|
|
45
|
+
"browser-workflows",
|
|
46
|
+
"workflow-recorder",
|
|
47
|
+
"flow-api"
|
|
38
48
|
],
|
|
39
49
|
"author": "ImperiumTech",
|
|
40
50
|
"license": "MIT",
|
|
@@ -55,6 +65,7 @@
|
|
|
55
65
|
"normalize-url": "^8.1.1",
|
|
56
66
|
"ora": "^8.2.0",
|
|
57
67
|
"p-queue": "^8.1.1",
|
|
68
|
+
"pdfjs-dist": "^4.0.379",
|
|
58
69
|
"playwright": "1.52",
|
|
59
70
|
"robots-parser": "^3.0.1",
|
|
60
71
|
"rss-parser": "^3.13.0",
|