novada-proxy-core 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/adapters/brightdata.d.ts +24 -0
- package/build/adapters/brightdata.js +56 -0
- package/build/adapters/generic.d.ts +32 -0
- package/build/adapters/generic.js +63 -0
- package/build/adapters/index.d.ts +16 -0
- package/build/adapters/index.js +42 -0
- package/build/adapters/novada.d.ts +23 -0
- package/build/adapters/novada.js +61 -0
- package/build/adapters/oxylabs.d.ts +22 -0
- package/build/adapters/oxylabs.js +54 -0
- package/build/adapters/smartproxy.d.ts +22 -0
- package/build/adapters/smartproxy.js +54 -0
- package/build/adapters/types.d.ts +58 -0
- package/build/adapters/types.js +7 -0
- package/build/config.d.ts +4 -0
- package/build/config.js +7 -0
- package/build/errors.d.ts +2 -0
- package/build/errors.js +58 -0
- package/build/index.d.ts +28 -0
- package/build/index.js +22 -0
- package/build/redact.d.ts +2 -0
- package/build/redact.js +24 -0
- package/build/tools/batch.d.ts +24 -0
- package/build/tools/batch.js +156 -0
- package/build/tools/crawl.d.ts +33 -0
- package/build/tools/crawl.js +604 -0
- package/build/tools/extract.d.ts +22 -0
- package/build/tools/extract.js +454 -0
- package/build/tools/fetch.d.ts +17 -0
- package/build/tools/fetch.js +243 -0
- package/build/tools/index.d.ts +19 -0
- package/build/tools/index.js +10 -0
- package/build/tools/map.d.ts +19 -0
- package/build/tools/map.js +131 -0
- package/build/tools/render.d.ts +8 -0
- package/build/tools/render.js +98 -0
- package/build/tools/research.d.ts +9 -0
- package/build/tools/research.js +126 -0
- package/build/tools/search.d.ts +9 -0
- package/build/tools/search.js +104 -0
- package/build/tools/session.d.ts +12 -0
- package/build/tools/session.js +108 -0
- package/build/tools/status.d.ts +2 -0
- package/build/tools/status.js +66 -0
- package/build/types.d.ts +34 -0
- package/build/types.js +1 -0
- package/build/utils.d.ts +18 -0
- package/build/utils.js +151 -0
- package/build/validation.d.ts +4 -0
- package/build/validation.js +6 -0
- package/package.json +50 -0
|
@@ -0,0 +1,604 @@
|
|
|
1
|
+
import { novadaProxyFetch } from "./fetch.js";
|
|
2
|
+
import { novadaProxyRender } from "./render.js";
|
|
3
|
+
import { htmlToText } from "../utils.js";
|
|
4
|
+
import { SAFE_COUNTRY, QUOTA_NOTE } from "../validation.js";
|
|
5
|
+
// ─── Link extraction (reused from map.ts pattern) ──────────────────────────────
|
|
6
|
+
/**
|
|
7
|
+
* Extract internal links from HTML. Resolves relative URLs against origin,
|
|
8
|
+
* deduplicates, and filters to same domain + subdomains.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Extract internal links from HTML. Resolves relative URLs against the current
 * page URL, deduplicates, and filters to the same domain plus subdomains.
 *
 * @param html            raw page HTML to scan for <a href="..."> anchors
 * @param pageUrl         URL the HTML came from (base for relative hrefs)
 * @param hostname        crawl-root hostname; links must match it or a subdomain
 * @param seen            Set of URLs already discovered crawl-wide
 * @param includePatterns optional compiled regexes; if set, a link must match at least one
 * @param excludePatterns optional compiled regexes; links matching any are skipped
 * @returns { allLinks, newLinks } — allLinks: unique links on this page;
 *          newLinks: the subset not present in `seen`
 */
function extractInternalLinks(html, pageUrl, hostname, seen, includePatterns, excludePatterns) {
    // Leading [^"'#?] skips fragment-only (#...) and query-only (?...) hrefs.
    const hrefRe = /<a[^>]+href=["']([^"'#?][^"']*)["']/gi;
    const allLinks = [];
    const newLinks = [];
    const pageDedup = new Set(); // dedup within this page
    const newDedup = new Set(); // O(1) membership for newLinks (was O(n) Array.includes)
    let match;
    while ((match = hrefRe.exec(html)) !== null) {
        const raw = match[1]?.trim();
        if (!raw)
            continue;
        let parsedUrl;
        try {
            parsedUrl = new URL(raw, pageUrl);
        }
        catch {
            continue; // skip malformed hrefs
        }
        // Normalise: strip trailing slash
        const resolved = parsedUrl.toString().replace(/\/$/, "");
        // Must be same domain or subdomain (hostname is unaffected by the
        // trailing-slash strip, so reuse the parsed URL — no second parse).
        const resolvedHostname = parsedUrl.hostname;
        if (resolvedHostname === hostname || resolvedHostname.endsWith(`.${hostname}`)) {
            // Skip non-page resources (images, stylesheets, scripts, etc.)
            // Path is taken from the normalised URL, matching the historical behavior.
            const path = new URL(resolved).pathname.toLowerCase();
            if (/\.(css|js|png|jpg|jpeg|gif|svg|ico|woff2?|ttf|eot|pdf|zip|tar|gz)$/.test(path)) {
                continue;
            }
            // Apply include patterns: if set, URL must match at least one
            if (includePatterns && includePatterns.length > 0) {
                if (!includePatterns.some(re => re.test(resolved)))
                    continue;
            }
            // Apply exclude patterns: if set, skip URLs matching any
            if (excludePatterns && excludePatterns.length > 0) {
                if (excludePatterns.some(re => re.test(resolved)))
                    continue;
            }
            // Count for total_links (dedup within page only)
            if (!pageDedup.has(resolved)) {
                pageDedup.add(resolved);
                allLinks.push(resolved);
            }
            // Count for new_links (not seen globally)
            if (!seen.has(resolved) && !newDedup.has(resolved)) {
                newDedup.add(resolved);
                newLinks.push(resolved);
            }
        }
    }
    return { allLinks, newLinks };
}
|
|
66
|
+
// ─── Title extraction ────────────────────────────────────────────────────────
|
|
67
|
+
/**
 * Best-effort page title: prefer the <title> element, falling back to the
 * first <h1>. Inner tags are stripped and the result trimmed. Returns
 * undefined when neither yields usable text.
 */
function extractTitle(html) {
    const fromTitle = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
    if (fromTitle?.[1]) {
        const cleaned = fromTitle[1].replace(/<[^>]+>/g, "").trim();
        if (cleaned)
            return cleaned;
    }
    // <title> missing or empty after cleanup — try the first heading instead.
    const fromH1 = /<h1[^>]*>([\s\S]*?)<\/h1>/i.exec(html);
    return fromH1?.[1] ? fromH1[1].replace(/<[^>]+>/g, "").trim() : undefined;
}
|
|
82
|
+
// ─── Simple field extraction (reuses extract.ts patterns) ──────────────────────
|
|
83
|
+
/**
 * Look up a single metadata field by name. Resolution order: Open Graph
 * <meta property="og:..."> (either attribute order), then standard
 * <meta name="..."> (either attribute order), then — for "title" only —
 * the page title. Returns null when nothing matches.
 */
function extractSimpleField(html, field) {
    const key = field.toLowerCase();
    const attempt = (re) => html.match(re)?.[1];
    // Open Graph meta tags (content may precede or follow the property attribute)
    const og = attempt(new RegExp(`<meta[^>]+property=["']og:${key}["'][^>]+content=["']([^"']+)["']`, "i"))
        ?? attempt(new RegExp(`<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:${key}["']`, "i"));
    if (og)
        return og;
    // Standard meta tags
    const meta = attempt(new RegExp(`<meta[^>]+name=["']${key}["'][^>]+content=["']([^"']+)["']`, "i"))
        ?? attempt(new RegExp(`<meta[^>]+content=["']([^"']+)["'][^>]+name=["']${key}["']`, "i"));
    if (meta)
        return meta;
    // Special case: "title" falls back to the document title
    return key === "title" ? (extractTitle(html) ?? null) : null;
}
|
|
100
|
+
// ─── Concurrency helper ────────────────────────────────────────────────────────
|
|
101
|
+
/**
|
|
102
|
+
* Process an array of items with bounded concurrency. Returns results in the
|
|
103
|
+
* same order as the input array.
|
|
104
|
+
*/
|
|
105
|
+
/**
 * Map `fn` over `items` with at most `concurrency` tasks in flight at once.
 * Results come back in input order. Implemented as a counting semaphore:
 * `inFlight` tracks running tasks, `waiters` holds resolvers for tasks
 * blocked on a free slot.
 */
async function mapWithConcurrency(items, concurrency, fn) {
    const out = new Array(items.length);
    const waiters = [];
    let inFlight = 0;
    // Resolve immediately when a slot is free; otherwise queue the grant.
    const takeSlot = () => new Promise((grant) => {
        if (inFlight < concurrency) {
            inFlight += 1;
            grant();
        }
        else {
            waiters.push(() => {
                inFlight += 1;
                grant();
            });
        }
    });
    // Hand the freed slot to the oldest waiter, if any.
    const freeSlot = () => {
        inFlight -= 1;
        waiters.shift()?.();
    };
    await Promise.all(items.map(async (item, i) => {
        await takeSlot();
        try {
            out[i] = await fn(item);
        }
        finally {
            freeSlot();
        }
    }));
    return out;
}
|
|
140
|
+
// ─── Rate limiter ────────────────────────────────────────────────────────────
|
|
141
|
+
/**
|
|
142
|
+
* Token-bucket rate limiter with a wait queue to prevent burst races.
|
|
143
|
+
* Multiple concurrent callers waiting on the same token will each get
|
|
144
|
+
* serialized via a single scheduled drain rather than per-waiter sleeps.
|
|
145
|
+
*/
|
|
146
|
+
class RateLimiter {
    // Current token balance; fractional values accumulate between refills.
    tokens;
    // Timestamp (ms) of the last refill — used to compute elapsed time.
    lastRefill;
    // Bucket capacity; tokens never accumulate beyond this (burst cap = rps).
    maxTokens;
    rate; // tokens per ms
    // Resolvers for acquire() calls currently waiting on a token.
    waitQueue = [];
    // True while a setTimeout drain is pending — ensures only one timer at a time.
    drainScheduled = false;
    // rps: sustained requests per second; also the initial/maximum burst size.
    constructor(rps) {
        this.maxTokens = rps;
        this.tokens = rps;
        this.lastRefill = Date.now();
        this.rate = rps / 1000;
    }
    // Consume one token, waiting if none is available. Throws (rather than
    // queueing unboundedly) once 100 callers are already waiting.
    async acquire() {
        this.refill();
        if (this.tokens >= 1) {
            // Fast path: token available now, no queueing.
            this.tokens--;
            return;
        }
        if (this.waitQueue.length >= 100) {
            throw new Error("Rate limiter queue full — too many concurrent requests");
        }
        // Slow path: park this caller; scheduleDrain() will resolve it later.
        return new Promise(resolve => {
            this.waitQueue.push(resolve);
            this.scheduleDrain();
        });
    }
    // Schedule a single timer that wakes queued waiters once tokens refill.
    scheduleDrain() {
        if (this.drainScheduled)
            return;
        this.drainScheduled = true;
        // Wait for just enough time for the next token to be available
        const waitMs = Math.max(0, (1 - this.tokens) / this.rate);
        setTimeout(() => {
            // Clear the flag first so a re-schedule below (or from a new
            // acquire()) is possible.
            this.drainScheduled = false;
            this.refill();
            // Wake waiters FIFO, one token each, until tokens or waiters run out.
            while (this.tokens >= 1 && this.waitQueue.length > 0) {
                this.tokens--;
                this.waitQueue.shift()();
            }
            // If there are still waiters, schedule another drain
            if (this.waitQueue.length > 0) {
                this.scheduleDrain();
            }
        }, waitMs);
    }
    // Credit tokens for time elapsed since the last refill, capped at maxTokens.
    refill() {
        const now = Date.now();
        const elapsed = now - this.lastRefill;
        this.tokens = Math.min(this.maxTokens, this.tokens + elapsed * this.rate);
        this.lastRefill = now;
    }
}
|
|
199
|
+
// ─── Page fetcher ─────────────────────────────────────────────────────────────
|
|
200
|
+
/**
|
|
201
|
+
* Fetch a page using the appropriate mode (proxy or render).
|
|
202
|
+
* Always returns raw HTML in `html` for link extraction.
|
|
203
|
+
* Returns the user's requested format in `userContent` (if different from raw HTML).
|
|
204
|
+
*/
|
|
205
|
+
/**
 * Fetch a single page via the proxy or (when a render mode and a browser
 * websocket endpoint are both present) via headless rendering.
 *
 * The returned `html` is always raw HTML, suitable for link extraction.
 * `userContent` carries the user's requested format when it differs from raw
 * HTML (render path only). `cacheHit` can only be true on the proxy path.
 */
async function fetchPage(url, format, renderMode, adapter, credentials, browserWs, country, timeout) {
    if (renderMode !== "none" && browserWs) {
        // Browser path: fetch raw HTML first — link extraction always needs it.
        const htmlPayload = JSON.parse(await novadaProxyRender({ url, format: "html", timeout }, browserWs));
        // When the caller wants a non-raw format, make a second render call for it.
        let userContent;
        if (format !== "raw") {
            const mdPayload = JSON.parse(await novadaProxyRender({ url, format: "markdown", timeout }, browserWs));
            userContent = mdPayload.data.content || "";
        }
        return {
            html: htmlPayload.data.content || "",
            userContent,
            statusCode: htmlPayload.data.status_code,
            cacheHit: false, // render calls are never cached
        };
    }
    // Proxy path: a single fetch in the caller's requested format.
    const payload = JSON.parse(await novadaProxyFetch({ url, format, country, timeout }, adapter, credentials));
    return {
        html: payload.data.content || "",
        statusCode: payload.data.status_code,
        cacheHit: payload.meta.cache_hit === true,
    };
}
|
|
235
|
+
// ─── Main crawl function ───────────────────────────────────────────────────────
|
|
236
|
+
// Upper bound on simultaneous page fetches per crawl batch; novadaProxyCrawl
// further caps this by rate_limit via Math.min.
const DEFAULT_CRAWL_CONCURRENCY = 3;
const DEFAULT_RATE_LIMIT = 2; // requests per second
|
|
238
|
+
/**
 * Breadth-first site crawl starting at `params.url`.
 *
 * Fetches pages level by level up to `max_depth`, stopping once `max_pages`
 * pages have been collected. Link extraction stays on the start domain
 * (subdomains included) and honours include/exclude regex patterns. Requests
 * are throttled by a token-bucket RateLimiter and a bounded concurrency pool.
 *
 * @param params      crawl parameters (see validateCrawlParams for the shape)
 * @param adapter     proxy adapter used for non-rendered fetches
 * @param credentials proxy credentials passed through to the adapter
 * @returns Promise<string> — a serialized { ok, tool, data, meta } envelope;
 *          when a render mode is requested without browser_ws, a serialized
 *          error envelope is returned instead of throwing
 * @throws Error when `params.url` is not a parseable URL
 */
export async function novadaProxyCrawl(params, adapter, credentials) {
    // Resolve aliases: max_pages takes precedence over limit, max_depth over depth
    // Defaults: 10 pages, depth 3 (aligned with MCP schema)
    const maxPages = params.max_pages ?? params.limit ?? 10;
    const maxDepth = params.max_depth ?? params.depth ?? 3;
    const { url, include_content = false, include_patterns, exclude_patterns, render: renderMode = "none", output_format, extract_fields, country, timeout = 60, format: legacyFormat = "markdown", rate_limit: rateLimit = DEFAULT_RATE_LIMIT, browser_ws, } = params;
    // Fix 3: Fail fast if render mode requested without browser_ws
    if (renderMode !== "none" && !browser_ws) {
        return JSON.stringify({
            ok: false,
            error: {
                code: "PROVIDER_NOT_CONFIGURED",
                message: "browser_ws is required when render mode is 'render' or 'browser'. Set NOVADA_BROWSER_WS env var.",
                recoverable: false,
                agent_instruction: "Either set NOVADA_BROWSER_WS or use render='none'.",
            },
        });
    }
    // Resolve output format: output_format takes precedence over legacy format
    const resolvedFormat = output_format ?? (legacyFormat === "raw" ? "html" : legacyFormat);
    // For fetch calls, translate output_format to fetch-compatible format
    const fetchFormat = resolvedFormat === "html" ? "raw" : "markdown";
    const startTime = Date.now();
    // Compile include/exclude patterns
    let compiledInclude;
    let compiledExclude;
    if (include_patterns && include_patterns.length > 0) {
        compiledInclude = include_patterns.map(p => new RegExp(p, "i"));
    }
    if (exclude_patterns && exclude_patterns.length > 0) {
        compiledExclude = exclude_patterns.map(p => new RegExp(p, "i"));
    }
    // Parse origin for relative-URL resolution and same-domain filtering
    let origin;
    let hostname;
    try {
        const parsed = new URL(url);
        origin = parsed.origin;
        hostname = parsed.hostname;
    }
    catch {
        throw new Error(`Invalid URL: ${url}`);
    }
    // Crawl state shared (mutated) by the per-page closures below; safe only
    // because all mutation happens on the single JS event-loop thread.
    const visited = new Set();
    const pages = [];
    const errors = [];
    let totalDiscovered = 0;
    let cachedPages = 0;
    let extraRawFetches = 0;
    let renderCredits = 0;
    const rateLimiter = new RateLimiter(rateLimit);
    // Seed the BFS frontier with the (slash-normalised) start URL at depth 0.
    let currentLevel = [{ url: url.replace(/\/$/, ""), depth: 0 }];
    visited.add(url.replace(/\/$/, ""));
    let deepestReached = 0;
    // Concurrency is bounded by rate limit
    const crawlConcurrency = Math.min(DEFAULT_CRAWL_CONCURRENCY, rateLimit);
    // BFS over depth levels: each loop iteration fetches one level's batch.
    while (currentLevel.length > 0 && pages.length < maxPages) {
        // Cap this level to remaining budget
        const remaining = maxPages - pages.length;
        const batch = currentLevel.slice(0, remaining);
        const batchResults = await mapWithConcurrency(batch, crawlConcurrency, async (item) => {
            // Rate limit
            await rateLimiter.acquire();
            try {
                // For link extraction we always need raw HTML. For content delivery
                // we may need a different format.
                const needRawForLinks = include_content && fetchFormat !== "raw";
                const isRenderMode = renderMode !== "none";
                // Primary fetch: get content in the user's requested format.
                // In render mode, fetchPage always returns raw HTML in .html and
                // the user's format in .userContent (if format !== raw).
                const primaryFormat = include_content ? fetchFormat : "raw";
                const primary = await fetchPage(item.url, primaryFormat === "markdown" ? "markdown" : "raw", renderMode, adapter, credentials, browser_ws, country, timeout);
                if (isRenderMode && !primary.cacheHit) {
                    renderCredits += 5; // Browser API costs 5 credits
                }
                // primary.html is always raw HTML (guaranteed by fetchPage).
                let linksHtml = primary.html;
                let contentForUser;
                if (isRenderMode) {
                    // fetchPage already fetched HTML for links and user format separately.
                    if (include_content) {
                        // Use userContent if available (markdown/text mode), else raw html
                        contentForUser = primary.userContent !== undefined ? primary.userContent : primary.html;
                    }
                }
                else if (include_content && needRawForLinks) {
                    // Proxy fetch: we fetched as markdown for the user's content, but we need raw HTML for links.
                    // This second fetch consumes another rate-limiter token.
                    await rateLimiter.acquire();
                    const rawResult = await fetchPage(item.url, "raw", "none", // always use proxy for link extraction
                    adapter, credentials, undefined, country, timeout);
                    linksHtml = rawResult.html;
                    contentForUser = primary.html;
                    if (!rawResult.cacheHit)
                        extraRawFetches++;
                }
                else if (include_content) {
                    contentForUser = primary.html;
                }
                if (primary.cacheHit)
                    cachedPages++;
                // Convert content to requested output format (non-render path only;
                // render path already returns the correct format from fetchPage)
                if (!isRenderMode && contentForUser !== undefined && resolvedFormat === "text") {
                    contentForUser = htmlToText(contentForUser);
                }
                const { allLinks, newLinks } = extractInternalLinks(linksHtml, item.url, // use current page URL as base for relative link resolution
                hostname, visited, compiledInclude, compiledExclude);
                // Extract title from raw HTML (always attempt)
                const title = extractTitle(linksHtml);
                const page = {
                    url: item.url,
                    depth: item.depth,
                    title,
                    status_code: primary.statusCode,
                    total_links: allLinks.length,
                    new_links: newLinks.length,
                };
                if (include_content && contentForUser !== undefined) {
                    page.content = contentForUser;
                }
                // Extract fields if requested (always use raw HTML for field extraction)
                if (extract_fields && extract_fields.length > 0) {
                    const fields = {};
                    for (const field of extract_fields) {
                        fields[field] = extractSimpleField(linksHtml, field);
                    }
                    page.extracted_fields = fields;
                }
                return { page, newLinks };
            }
            catch (err) {
                // A failed page still produces a result entry (with `error` set)
                // so the caller can see which URLs failed and at what depth.
                const msg = err instanceof Error ? err.message : String(err);
                errors.push({ url: item.url, error: msg, depth: item.depth });
                return {
                    page: {
                        url: item.url,
                        depth: item.depth,
                        total_links: 0,
                        new_links: 0,
                        error: msg,
                    },
                    newLinks: [],
                };
            }
        });
        // Collect results and build next level
        const nextLevel = [];
        for (const { page, newLinks } of batchResults) {
            pages.push(page);
            if (page.depth > deepestReached)
                deepestReached = page.depth;
            // Add newly discovered links to next level if within depth
            for (const link of newLinks) {
                totalDiscovered++;
                if (!visited.has(link) && page.depth + 1 <= maxDepth) {
                    visited.add(link);
                    nextLevel.push({ url: link, depth: page.depth + 1 });
                }
            }
        }
        // Move to next depth level
        currentLevel = nextLevel;
    }
    const latency_ms = Date.now() - startTime;
    // Check for sitemap.xml hint
    const sitemapUrl = `${origin}/sitemap.xml`;
    const sitemapHint = !visited.has(sitemapUrl)
        ? `${sitemapUrl} (not crawled — check manually for a complete URL list)`
        : undefined;
    // Compute credit estimate
    // Non-render: one credit per non-cached page, plus extra raw fetches.
    // Render: 5 credits per browser call, accumulated above.
    const baseCredits = pages.length - cachedPages;
    const totalCredits = renderMode !== "none"
        ? renderCredits
        : baseCredits + extraRawFetches;
    const result = {
        ok: true,
        tool: "novada_proxy_crawl",
        data: {
            start_url: url,
            domain: hostname,
            pages_crawled: pages.length,
            depth_reached: deepestReached,
            urls_crawled: pages.length, // backward compat
            urls_discovered: totalDiscovered,
            pages: pages,
            ...(errors.length > 0 ? { errors: errors } : {}),
            ...(sitemapHint ? { sitemap_hint: sitemapHint } : {}),
        },
        meta: {
            latency_ms,
            country,
            quota: {
                credits_estimated: totalCredits,
                note: QUOTA_NOTE,
            },
        },
    };
    // Omit `country` from meta entirely when it was not supplied.
    if (!result.meta.country)
        delete result.meta.country;
    return JSON.stringify(result);
}
|
|
440
|
+
// ─── Validation ────────────────────────────────────────────────────────────────
|
|
441
|
+
/**
 * Validate and normalise one list of user-supplied regex pattern strings.
 * Shared by include_patterns / exclude_patterns (they previously duplicated
 * this logic verbatim). Enforces: array of strings, at most 20 entries, each
 * at most 100 chars, no nested quantifiers (ReDoS guard), and each must
 * compile as a RegExp.
 *
 * @param value unvalidated input (may be undefined)
 * @param label parameter name used in error messages, e.g. "include_patterns"
 * @returns the validated string array, or undefined when value is undefined
 * @throws Error describing the first violation found
 */
function validatePatternList(value, label) {
    if (value === undefined)
        return undefined;
    if (!Array.isArray(value)) {
        throw new Error(`${label} must be an array of regex strings`);
    }
    if (value.length > 20) {
        throw new Error(`${label} can have at most 20 entries`);
    }
    for (const p of value) {
        if (typeof p !== "string") {
            // label.slice(0, -1) drops the plural "s" ("include_patterns" → "include_pattern")
            throw new Error(`each ${label.slice(0, -1)} must be a string`);
        }
        if (p.length > 100) {
            throw new Error(`${label} entries must be at most 100 characters (got ${p.length})`);
        }
        // Reject nested quantifiers (ReDoS risk): (a+)+, (a*)+, (a{2,})*
        if (/([+*}])\s*[)]\s*[+*{?]/.test(p) || /([+*])\s*[+*]/.test(p)) {
            throw new Error(`${label} contains potentially unsafe nested quantifiers: ${p}`);
        }
        try {
            new RegExp(p);
        }
        catch {
            throw new Error(`Invalid regex in ${label}: ${p}`);
        }
    }
    return value;
}
/**
 * Coerce a value to a number and enforce an inclusive [min, max] range.
 * Throws `message` verbatim on any violation (non-finite or out of range).
 */
function validateRange(value, min, max, message) {
    const n = Number(value);
    if (!Number.isFinite(n) || n < min || n > max) {
        throw new Error(message);
    }
    return n;
}
/**
 * Validate raw crawl-tool input and return a normalised parameter object.
 *
 * Alias handling: max_pages (1–100) takes precedence over legacy limit
 * (10–200); max_depth over legacy depth (both 1–5). Defaults: 10 pages,
 * depth 3, timeout 60s, rate_limit 2 rps, render "none", format "markdown".
 *
 * @param raw untrusted input object
 * @returns normalised params for novadaProxyCrawl
 * @throws Error on the first invalid field encountered
 */
export function validateCrawlParams(raw) {
    if (!raw.url || typeof raw.url !== "string") {
        throw new Error("url is required and must be a string");
    }
    if (!raw.url.startsWith("http://") && !raw.url.startsWith("https://")) {
        throw new Error("url must start with http:// or https://");
    }
    // Handle max_pages (new) and limit (legacy) — max_pages takes precedence
    // When neither is set: default 10 (matches MCP schema default for max_pages)
    let limit;
    if (raw.max_pages !== undefined) {
        limit = validateRange(raw.max_pages, 1, 100, "max_pages must be between 1 and 100");
    }
    else if (raw.limit !== undefined) {
        limit = validateRange(raw.limit, 10, 200, "limit must be between 10 and 200");
    }
    else {
        limit = 10;
    }
    // Handle max_depth (new) and depth (legacy) — max_depth takes precedence
    // When neither is set: default 3 (matches MCP schema default for max_depth)
    let depth;
    if (raw.max_depth !== undefined) {
        depth = validateRange(raw.max_depth, 1, 5, "max_depth must be between 1 and 5");
    }
    else if (raw.depth !== undefined) {
        depth = validateRange(raw.depth, 1, 5, "depth must be between 1 and 5");
    }
    else {
        depth = 3;
    }
    if (raw.country !== undefined) {
        if (typeof raw.country !== "string" || raw.country.length > 10 || !SAFE_COUNTRY.test(raw.country)) {
            throw new Error("country must be a 2-letter ISO code with no hyphens (e.g. US, DE, GB)");
        }
    }
    const timeout = raw.timeout !== undefined
        ? validateRange(raw.timeout, 1, 120, "timeout must be between 1 and 120 seconds")
        : 60;
    if (raw.format !== undefined && raw.format !== "raw" && raw.format !== "markdown") {
        throw new Error("format must be 'raw' or 'markdown'");
    }
    // Validate output_format
    const validOutputFormats = ["markdown", "html", "text"];
    if (raw.output_format !== undefined && !validOutputFormats.includes(raw.output_format)) {
        throw new Error("output_format must be 'markdown', 'html', or 'text'");
    }
    // Validate render mode
    const validRenderModes = ["none", "render", "browser"];
    if (raw.render !== undefined && !validRenderModes.includes(raw.render)) {
        throw new Error("render must be 'none', 'render', or 'browser'");
    }
    const includePatterns = validatePatternList(raw.include_patterns, "include_patterns");
    const excludePatterns = validatePatternList(raw.exclude_patterns, "exclude_patterns");
    // Validate extract_fields
    let extractFields;
    if (raw.extract_fields !== undefined) {
        if (!Array.isArray(raw.extract_fields)) {
            throw new Error("extract_fields must be an array of field name strings");
        }
        if (raw.extract_fields.length === 0 || raw.extract_fields.length > 20) {
            throw new Error("extract_fields must have between 1 and 20 entries");
        }
        for (const f of raw.extract_fields) {
            if (typeof f !== "string" || f.length > 50) {
                throw new Error("each extract field must be a string with max 50 characters");
            }
            if (!/^[a-zA-Z0-9_:\-]{1,50}$/.test(f)) {
                throw new Error(`extract_fields entries must be alphanumeric/underscore/colon/hyphen (got: ${f})`);
            }
        }
        extractFields = raw.extract_fields;
    }
    // Validate rate_limit
    const rateLimit = raw.rate_limit !== undefined
        ? validateRange(raw.rate_limit, 0.5, 10, "rate_limit must be between 0.5 and 10 requests per second")
        : 2;
    return {
        url: raw.url,
        depth,
        limit,
        max_pages: raw.max_pages !== undefined ? limit : undefined,
        max_depth: raw.max_depth !== undefined ? depth : undefined,
        include_content: raw.include_content === true,
        include_patterns: includePatterns,
        exclude_patterns: excludePatterns,
        render: raw.render || "none",
        output_format: raw.output_format,
        extract_fields: extractFields,
        country: raw.country,
        timeout,
        format: raw.format || "markdown",
        rate_limit: rateLimit,
    };
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { ProxyAdapter, ProxyCredentials } from "../adapters/index.js";
|
|
2
|
+
/**
 * Parameters for the extract tool. Runtime validation lives in
 * validateExtractParams (implemented in extract.js).
 */
export interface ExtractParams {
    /** Page URL to fetch and extract fields from. */
    url: string;
    /** Names of the fields to extract. */
    fields: string[];
    /** Optional field-name map — NOTE(review): exact semantics (hint vs. selector) are defined in extract.js; confirm there. */
    schema?: Record<string, string>;
    /** Optional proxy exit country. */
    country?: string;
    /** Optional proxy exit city. */
    city?: string;
    /** Optional session identifier — presumably for sticky proxy sessions; verify against session.js. */
    session_id?: string;
    /** Request timeout — presumably seconds, matching the crawl tool; verify in extract.js. */
    timeout?: number;
    /** Opt-in escalation to browser rendering on failure (see shouldEscalateToRender). */
    render_fallback?: boolean;
}
/** Decide from an error message whether to retry via browser rendering — heuristics defined in extract.js. */
export declare function shouldEscalateToRender(msg: string): boolean;
/** Run the extract tool; resolves to a serialized JSON result string. */
export declare function novadaProxyExtract(params: ExtractParams, adapter: ProxyAdapter, credentials: ProxyCredentials, browserWsEndpoint?: string): Promise<string>;
/**
 * Heuristic field extraction from HTML.
 *
 * Uses common patterns: meta tags, Open Graph, Schema.org JSON-LD, headings,
 * and semantic HTML. Falls back to regex scanning for common field names.
 */
export declare function extractField(html: string, field: string, baseUrl?: string): string | string[] | null;
/** Recursive lookup of `key` inside `obj` — the optional `depth` presumably bounds recursion; confirm in extract.js. */
export declare function deepFind(obj: unknown, key: string, depth?: number): unknown;
/** Validate untrusted input and return normalised ExtractParams; implementation in extract.js. */
export declare function validateExtractParams(raw: Record<string, unknown>): ExtractParams;
|