novada-proxy-core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/build/adapters/brightdata.d.ts +24 -0
  2. package/build/adapters/brightdata.js +56 -0
  3. package/build/adapters/generic.d.ts +32 -0
  4. package/build/adapters/generic.js +63 -0
  5. package/build/adapters/index.d.ts +16 -0
  6. package/build/adapters/index.js +42 -0
  7. package/build/adapters/novada.d.ts +23 -0
  8. package/build/adapters/novada.js +61 -0
  9. package/build/adapters/oxylabs.d.ts +22 -0
  10. package/build/adapters/oxylabs.js +54 -0
  11. package/build/adapters/smartproxy.d.ts +22 -0
  12. package/build/adapters/smartproxy.js +54 -0
  13. package/build/adapters/types.d.ts +58 -0
  14. package/build/adapters/types.js +7 -0
  15. package/build/config.d.ts +4 -0
  16. package/build/config.js +7 -0
  17. package/build/errors.d.ts +2 -0
  18. package/build/errors.js +58 -0
  19. package/build/index.d.ts +28 -0
  20. package/build/index.js +22 -0
  21. package/build/redact.d.ts +2 -0
  22. package/build/redact.js +24 -0
  23. package/build/tools/batch.d.ts +24 -0
  24. package/build/tools/batch.js +156 -0
  25. package/build/tools/crawl.d.ts +33 -0
  26. package/build/tools/crawl.js +604 -0
  27. package/build/tools/extract.d.ts +22 -0
  28. package/build/tools/extract.js +454 -0
  29. package/build/tools/fetch.d.ts +17 -0
  30. package/build/tools/fetch.js +243 -0
  31. package/build/tools/index.d.ts +19 -0
  32. package/build/tools/index.js +10 -0
  33. package/build/tools/map.d.ts +19 -0
  34. package/build/tools/map.js +131 -0
  35. package/build/tools/render.d.ts +8 -0
  36. package/build/tools/render.js +98 -0
  37. package/build/tools/research.d.ts +9 -0
  38. package/build/tools/research.js +126 -0
  39. package/build/tools/search.d.ts +9 -0
  40. package/build/tools/search.js +104 -0
  41. package/build/tools/session.d.ts +12 -0
  42. package/build/tools/session.js +108 -0
  43. package/build/tools/status.d.ts +2 -0
  44. package/build/tools/status.js +66 -0
  45. package/build/types.d.ts +34 -0
  46. package/build/types.js +1 -0
  47. package/build/utils.d.ts +18 -0
  48. package/build/utils.js +151 -0
  49. package/build/validation.d.ts +4 -0
  50. package/build/validation.js +6 -0
  51. package/package.json +50 -0
@@ -0,0 +1,243 @@
1
+ import axios from "axios";
2
+ import { HttpsProxyAgent } from "https-proxy-agent";
3
+ import { HttpProxyAgent } from "http-proxy-agent";
4
+ import { gunzipSync, brotliDecompressSync, inflateSync } from "zlib";
5
+ import { DEFAULT_USER_AGENT } from "../config.js";
6
+ import { htmlToMarkdown, unicodeSafeTruncate, countHtmlTags, contentDensity } from "../utils.js";
7
+ import { SAFE_COUNTRY, SAFE_CITY, SAFE_SESSION_ID, QUOTA_NOTE } from "../validation.js";
8
// ─── In-process response cache ───────────────────────────────────────────────
// Eliminates duplicate proxy credits when agents re-fetch the same URL.
// Keyed by (url + format + country). Session-pinned requests are NEVER cached
// (stickiness implies same-IP routing — caching would break that guarantee).
// TTL defaults to 300s. Set PROXY4AGENT_CACHE_TTL_SECONDS=0 to disable.
// Fallback TTL (seconds) used when the env var is unset or invalid.
const DEFAULT_CACHE_TTL_SECONDS = 300;
// Hard cap on cached entries; on overflow, expired entries are pruned first,
// then the oldest insertion is evicted (see pruneExpired / evictOldest below).
const MAX_CACHE_ENTRIES = 200;
// Map preserves insertion order, which both the LRU "refresh" on cache hits
// and evictOldest() rely on.
// Values: { payload: string(JSON), expires_at: epoch-ms, cached_at: epoch-ms }.
const _responseCache = new Map();
16
/**
 * Resolve the cache TTL (seconds) from the environment.
 * A value of 0 disables caching; unset/invalid values fall back to the default.
 */
export function getCacheTtl() {
    const configured = Number(process.env.PROXY4AGENT_CACHE_TTL_SECONDS);
    if (Number.isFinite(configured) && configured >= 0) {
        return configured;
    }
    return DEFAULT_CACHE_TTL_SECONDS;
}
21
/**
 * Build the cache key for a request.
 * Country is part of the key because geo-targeting changes the response body.
 */
export function makeCacheKey(url, format, country) {
    const countryPart = country ?? "";
    return [url, format, countryPart].join("|");
}
25
/**
 * Drop every cache entry whose TTL has elapsed.
 * Deleting while iterating a Map is safe in JS, so no snapshot is needed.
 */
function pruneExpired() {
    const cutoff = Date.now();
    for (const [key, entry] of _responseCache.entries()) {
        if (entry.expires_at <= cutoff) {
            _responseCache.delete(key);
        }
    }
}
33
/**
 * Remove the entry at the front of the Map (the oldest insertion), if any.
 * Map iteration order is insertion order, so the first key is the oldest.
 */
function evictOldest() {
    for (const key of _responseCache.keys()) {
        _responseCache.delete(key);
        return;
    }
}
39
/** Clear the entire cache (useful for tests and manual cache invalidation). */
// Exported so callers (e.g. test suites) can reset module-level cache state
// without re-importing the module.
export function clearResponseCache() {
    _responseCache.clear();
}
43
/**
 * Decode a raw response body to a UTF-8 string, honouring the declared
 * Content-Encoding. When the server declares an encoding we trust it and let
 * decompression errors propagate so the caller's retry loop fires. With no
 * declared encoding, probe for the gzip magic header before decoding as-is.
 */
function decompress(buffer, encoding) {
    switch (encoding) {
        case "gzip":
            return gunzipSync(buffer).toString("utf-8");
        case "br":
            return brotliDecompressSync(buffer).toString("utf-8");
        case "deflate":
            return inflateSync(buffer).toString("utf-8");
        default:
            break;
    }
    // No encoding header: gzip is the only format with dependable magic bytes
    // (0x1f 0x8b). Brotli and deflate lack reliable magic, so no probe for them.
    const looksGzipped = buffer.length >= 2 && buffer[0] === 0x1f && buffer[1] === 0x8b;
    if (looksGzipped) {
        try {
            return gunzipSync(buffer).toString("utf-8");
        }
        catch {
            // Magic bytes lied (corrupt stream) — fall through to raw decode.
        }
    }
    return buffer.toString("utf-8");
}
62
/**
 * Fetch `params.url` through the active proxy adapter and return a JSON string
 * envelope: { ok, tool, data, meta }.
 *
 * Behaviour:
 *  - Serves from the in-process cache when possible (never for sticky sessions).
 *  - Warns on stderr when targeting params the adapter cannot honour are set.
 *  - Retries once (500ms backoff) on network errors / 5xx; 4xx fails
 *    immediately, with a dedicated message for HTTP 429.
 *  - Handles decompression itself (axios decompress:false — see inline note),
 *    converts HTML to markdown when requested, truncates very large output.
 *
 * @param {object} params       validated fetch params (see validateFetchParams)
 * @param {object} adapter      proxy adapter (capabilities + buildProxyUrl)
 * @param {object} credentials  credentials passed through to the adapter
 * @returns {Promise<string>} JSON-stringified result envelope
 * @throws {Error} on invalid URL scheme, HTTP 429, or final failure
 */
export async function novadaProxyFetch(params, adapter, credentials) {
    const { url, format = "markdown", timeout = 60 } = params;
    if (!url.startsWith("http://") && !url.startsWith("https://")) {
        throw new Error("URL must start with http:// or https://");
    }
    // ── Cache lookup ────────────────────────────────────────────────────────────
    // Skip cache when session_id is set: sticky sessions imply same-IP routing,
    // so two agents with different session IDs would wrongly share cached content.
    const ttl = getCacheTtl();
    const cacheKey = !params.session_id && ttl > 0
        ? makeCacheKey(url, format, params.country)
        : null;
    if (cacheKey) {
        const hit = _responseCache.get(cacheKey);
        if (hit && hit.expires_at > Date.now()) {
            // LRU: refresh position in Map so this entry isn't evicted as "oldest"
            _responseCache.delete(cacheKey);
            _responseCache.set(cacheKey, hit);
            const serveStart = Date.now();
            const parsed = JSON.parse(hit.payload);
            parsed.meta.cache_hit = true;
            parsed.meta.cache_age_seconds = Math.floor((Date.now() - hit.cached_at) / 1000);
            parsed.meta.latency_ms = Date.now() - serveStart; // ~0ms — reflects cache serve time, not proxy latency
            return JSON.stringify(parsed);
        }
    }
    // Warn if targeting params are requested but the active adapter doesn't support them
    const unsupported = [];
    if (params.country && !adapter.capabilities.country)
        unsupported.push(`country (not supported by ${adapter.displayName})`);
    if (params.city && !adapter.capabilities.city)
        unsupported.push(`city (not supported by ${adapter.displayName})`);
    if (params.session_id && !adapter.capabilities.sticky)
        unsupported.push(`session_id/sticky (not supported by ${adapter.displayName})`);
    if (unsupported.length) {
        console.error(`[novada-proxy] Warning: ${unsupported.join(", ")}. Switch to Novada for full targeting support.`);
    }
    const proxyUrl = adapter.buildProxyUrl(credentials, params);
    // HttpsProxyAgent for HTTPS targets (CONNECT tunnel + TLS); HttpProxyAgent for plain HTTP
    const httpsAgent = new HttpsProxyAgent(proxyUrl);
    const httpAgent = new HttpProxyAgent(proxyUrl);
    let lastError = null;
    const startTime = Date.now();
    for (let attempt = 1; attempt <= 2; attempt++) {
        try {
            const response = await axios.get(url, {
                httpsAgent,
                httpAgent,
                proxy: false,
                // arraybuffer + decompress:false = we handle decompression ourselves.
                // axios built-in decompress conflicts with https-proxy-agent CONNECT tunnel
                // on large pages (Amazon 1.6MB returned ECONNABORTED with decompress:true).
                responseType: "arraybuffer",
                decompress: false,
                headers: {
                    "User-Agent": DEFAULT_USER_AGENT,
                    Accept: "text/html,application/xhtml+xml,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                },
                timeout: timeout * 1000,
                maxContentLength: 50 * 1024 * 1024,
                maxRedirects: 5,
            });
            const latency_ms = Date.now() - startTime;
            const encoding = response.headers["content-encoding"];
            const contentType = response.headers["content-type"];
            const body = decompress(Buffer.from(response.data), encoding);
            const isHtml = contentType?.includes("text/html") || body.toLowerCase().includes("<html");
            // Pre-truncate before expensive markdown conversion to avoid huge intermediate strings
            const bodyForConversion = body.length > 500_000 ? body.slice(0, 500_000) : body;
            const output = format === "markdown" && isHtml ? htmlToMarkdown(bodyForConversion) : body;
            const truncated = output.length > 100_000;
            const finalOutput = truncated
                ? unicodeSafeTruncate(output, 100_000) + "\n\n[... truncated — page is large]"
                : output;
            // Compute content density: ratio of useful text to tag overhead
            const tagCount = isHtml ? countHtmlTags(bodyForConversion) : 0;
            const content_density = isHtml
                ? contentDensity(finalOutput.length, tagCount)
                : 1.0;
            const result = {
                ok: true,
                tool: "novada_proxy_fetch",
                data: {
                    url,
                    status_code: response.status,
                    content: finalOutput,
                    content_type: contentType || "unknown",
                    size_bytes: body.length,
                    warnings: unsupported.length ? unsupported.map(u => `Ignored param: ${u}`) : undefined,
                },
                meta: {
                    latency_ms,
                    country: params.country,
                    session_id: params.session_id,
                    truncated,
                    content_density,
                    quota: { credits_estimated: 1, note: QUOTA_NOTE },
                    cache_hit: false,
                },
            };
            // Remove undefined fields from data
            if (!result.data.warnings)
                delete result.data.warnings;
            if (!result.meta.country)
                delete result.meta.country;
            if (!result.meta.session_id)
                delete result.meta.session_id;
            // ── Store in cache ──────────────────────────────────────────────────────
            if (cacheKey) {
                if (_responseCache.size >= MAX_CACHE_ENTRIES) {
                    pruneExpired();
                    if (_responseCache.size >= MAX_CACHE_ENTRIES)
                        evictOldest();
                }
                const now = Date.now();
                _responseCache.set(cacheKey, {
                    payload: JSON.stringify(result),
                    expires_at: now + ttl * 1000,
                    cached_at: now,
                });
            }
            return JSON.stringify(result);
        }
        catch (err) {
            lastError = err instanceof Error ? err : new Error(String(err));
            // Surface rate-limit errors clearly
            if (axios.isAxiosError(err) && err.response?.status === 429) {
                throw new Error("Rate limited (HTTP 429). Wait a moment before retrying. Consider using a session_id for consistent routing.");
            }
            // Only retry on network errors or 5xx — never retry 4xx (auth, not-found, etc.)
            const isRetryable = !(axios.isAxiosError(err) &&
                err.response &&
                err.response.status < 500);
            if (attempt < 2 && isRetryable) {
                // Backoff: 500ms * attempt before the retry
                await new Promise(r => setTimeout(r, 500 * attempt));
                continue;
            }
            // FIX: control previously fell through here, so the for-loop retried
            // even non-retryable 4xx errors (contradicting the comment above and
            // burning an extra proxy credit). Break so the failure surfaces now.
            break;
        }
    }
    throw lastError;
}
206
/**
 * Validate raw fetch params and normalise defaults.
 * Throws a descriptive Error on the first invalid field; returns the
 * normalised params object on success.
 */
export function validateFetchParams(raw) {
    const { url, country, city, session_id, format } = raw;
    if (!url || typeof url !== "string") {
        throw new Error("url is required and must be a string");
    }
    const hasScheme = url.startsWith("http://") || url.startsWith("https://");
    if (!hasScheme) {
        throw new Error("url must start with http:// or https://");
    }
    if (country !== undefined) {
        const valid = typeof country === "string" && country.length <= 10 && SAFE_COUNTRY.test(country);
        if (!valid) {
            throw new Error("country must be a 2-letter ISO code with no hyphens (e.g. US, DE, GB)");
        }
    }
    if (city !== undefined) {
        const valid = typeof city === "string" && city.length <= 50 && SAFE_CITY.test(city);
        if (!valid) {
            throw new Error("city must contain only letters, numbers, underscores, max 50 chars (e.g. newyork, london)");
        }
    }
    if (session_id !== undefined) {
        const valid = typeof session_id === "string" && session_id.length <= 64 && SAFE_SESSION_ID.test(session_id);
        if (!valid) {
            throw new Error("session_id must contain only letters, numbers, and underscores, max 64 chars (no hyphens)");
        }
    }
    if (format && format !== "raw" && format !== "markdown") {
        throw new Error("format must be 'raw' or 'markdown'");
    }
    const timeout = raw.timeout === undefined ? 60 : Number(raw.timeout);
    if (!Number.isFinite(timeout) || timeout < 1 || timeout > 120) {
        throw new Error("timeout must be between 1 and 120 seconds");
    }
    return {
        url,
        country,
        city,
        session_id,
        format: format || "markdown",
        timeout,
    };
}
@@ -0,0 +1,19 @@
1
// Barrel declarations for the tools module: re-exports each tool's entry
// point, its params validator, and its params type from a single path.
export { novadaProxyFetch, validateFetchParams } from "./fetch.js";
export type { FetchParams } from "./fetch.js";
export { novadaProxyBatchFetch, validateBatchFetchParams } from "./batch.js";
export type { BatchFetchParams, BatchFetchResult } from "./batch.js";
export { novadaProxySearch, validateSearchParams } from "./search.js";
export type { SearchParams } from "./search.js";
export { novadaProxySession, validateSessionParams } from "./session.js";
export type { SessionParams } from "./session.js";
export { novadaProxyStatus } from "./status.js";
export { novadaProxyRender, validateRenderParams } from "./render.js";
export type { RenderParams } from "./render.js";
export { novadaProxyExtract, validateExtractParams } from "./extract.js";
export type { ExtractParams } from "./extract.js";
export { novadaProxyMap, validateMapParams } from "./map.js";
export type { MapParams } from "./map.js";
export { novadaProxyCrawl, validateCrawlParams } from "./crawl.js";
export type { CrawlParams } from "./crawl.js";
export { novadaProxyResearch, validateResearchParams } from "./research.js";
export type { ResearchParams } from "./research.js";
@@ -0,0 +1,10 @@
1
// Barrel module for the tools package: re-exports every tool's entry point
// and its params validator from a single import path.
export { novadaProxyFetch, validateFetchParams } from "./fetch.js";
export { novadaProxyBatchFetch, validateBatchFetchParams } from "./batch.js";
export { novadaProxySearch, validateSearchParams } from "./search.js";
export { novadaProxySession, validateSessionParams } from "./session.js";
export { novadaProxyStatus } from "./status.js";
export { novadaProxyRender, validateRenderParams } from "./render.js";
export { novadaProxyExtract, validateExtractParams } from "./extract.js";
export { novadaProxyMap, validateMapParams } from "./map.js";
export { novadaProxyCrawl, validateCrawlParams } from "./crawl.js";
export { novadaProxyResearch, validateResearchParams } from "./research.js";
@@ -0,0 +1,19 @@
1
+ import type { ProxyAdapter, ProxyCredentials } from "../adapters/index.js";
2
export interface MapParams {
    /** Starting page to map (must begin with http:// or https://). */
    url: string;
    /** Maximum URLs returned (10–200; default 50 — see validateMapParams). */
    limit?: number;
    /** Also collect off-domain links (default false). */
    include_external?: boolean;
    /** Optional geo-targeting country code (e.g. US). */
    country?: string;
    /** Fetch timeout in seconds (1–120; default 60). */
    timeout?: number;
}
/**
 * Crawl a URL and return all internal links found on the page (and optionally
 * linked pages up to limit). This is a shallow map — it fetches the starting URL,
 * extracts all <a href> links, normalises them to absolute URLs, filters to the
 * same domain, and returns the list.
 *
 * For a full sitemap crawl, agents should call novada_proxy_map iteratively on
 * the discovered URLs or use the sitemap.xml directly.
 */
export declare function novadaProxyMap(params: MapParams, adapter: ProxyAdapter, credentials: ProxyCredentials): Promise<string>;
/** Validate raw input and apply defaults; throws on the first invalid field. */
export declare function validateMapParams(raw: Record<string, unknown>): MapParams;
@@ -0,0 +1,131 @@
1
+ import { novadaProxyFetch } from "./fetch.js";
2
+ import { SAFE_COUNTRY, QUOTA_NOTE } from "../validation.js";
3
/**
 * Crawl a URL and return all internal links found on the page (and optionally
 * linked pages up to limit). This is a shallow map — it fetches the starting URL,
 * extracts all <a href> links, normalises them to absolute URLs, filters to the
 * same domain, and returns the list.
 *
 * For a full sitemap crawl, agents should call novada_proxy_map iteratively on
 * the discovered URLs or use the sitemap.xml directly.
 */
export async function novadaProxyMap(params, adapter, credentials) {
    const { url, limit = 50, include_external = false, country, timeout = 60 } = params;
    const startTime = Date.now();
    // Parse origin for relative-URL resolution and same-domain filtering
    let origin;
    let hostname;
    try {
        const parsed = new URL(url);
        origin = parsed.origin;
        hostname = parsed.hostname;
    }
    catch {
        throw new Error(`Invalid URL: ${url}`);
    }
    // Fetch the starting page as raw HTML (markdown would strip the <a> tags)
    const fetchResultStr = await novadaProxyFetch({ url, format: "raw", country, timeout }, adapter, credentials);
    let html;
    try {
        const fetchResult = JSON.parse(fetchResultStr);
        html = fetchResult.data.content || "";
    }
    catch {
        // Defensive: if the envelope isn't valid JSON, treat the payload as HTML
        html = fetchResultStr;
    }
    // Extract all <a href> links; hrefs whose first char is '#' or '?' are skipped
    const hrefRe = /<a[^>]+href=["']([^"'#?][^"']*)["']/gi;
    const seen = new Set();
    const internalUrls = [];
    const externalUrls = [];
    let match;
    while ((match = hrefRe.exec(html)) !== null) {
        const raw = match[1]?.trim();
        if (!raw)
            continue;
        let resolved;
        try {
            resolved = new URL(raw, origin).toString();
        }
        catch {
            continue; // skip malformed hrefs
        }
        // Normalise: strip trailing slash, fragments already excluded by regex
        resolved = resolved.replace(/\/$/, "");
        if (seen.has(resolved))
            continue;
        seen.add(resolved);
        // Classify by hostname: the exact host or any subdomain counts as internal.
        // (Query strings are NOT stripped here — only leading-'?' hrefs were
        // excluded by the regex above.)
        let resolvedHostname;
        try {
            resolvedHostname = new URL(resolved).hostname;
        }
        catch {
            continue;
        }
        if (resolvedHostname === hostname || resolvedHostname.endsWith(`.${hostname}`)) {
            internalUrls.push(resolved);
        }
        else if (include_external) {
            externalUrls.push(resolved);
        }
    }
    // Hint at /sitemap.xml when the page itself didn't link to it
    const sitemapUrl = `${origin}/sitemap.xml`;
    const hasSitemap = !seen.has(sitemapUrl) ? `${sitemapUrl} (check manually — not on this page)` : null;
    // Apply limit: internal links take priority; externals fill the remainder
    const internal = internalUrls.slice(0, limit);
    const external = include_external ? externalUrls.slice(0, Math.max(0, limit - internal.length)) : [];
    const latency_ms = Date.now() - startTime;
    const result = {
        ok: true,
        tool: "novada_proxy_map",
        data: {
            source_url: url,
            domain: hostname,
            internal_url_count: internal.length,
            external_url_count: external.length,
            total_found: internalUrls.length + (include_external ? externalUrls.length : 0),
            truncated: internalUrls.length > limit,
            internal_urls: internal,
            ...(include_external ? { external_urls: external } : {}),
            ...(hasSitemap ? { sitemap_hint: hasSitemap } : {}),
        },
        meta: {
            latency_ms,
            country,
            quota: { credits_estimated: 1, note: QUOTA_NOTE },
        },
    };
    // Drop undefined meta fields so the JSON envelope stays compact
    if (!result.meta.country)
        delete result.meta.country;
    return JSON.stringify(result);
}
104
/**
 * Validate raw map params and normalise defaults.
 * Throws a descriptive Error on the first invalid field.
 */
export function validateMapParams(raw) {
    if (!raw.url || typeof raw.url !== "string") {
        throw new Error("url is required and must be a string");
    }
    const hasScheme = raw.url.startsWith("http://") || raw.url.startsWith("https://");
    if (!hasScheme) {
        throw new Error("url must start with http:// or https://");
    }
    const limit = raw.limit === undefined ? 50 : Number(raw.limit);
    if (!Number.isFinite(limit) || limit < 10 || limit > 200) {
        throw new Error("limit must be between 10 and 200");
    }
    if (raw.country !== undefined) {
        const valid = typeof raw.country === "string" && raw.country.length <= 10 && SAFE_COUNTRY.test(raw.country);
        if (!valid) {
            throw new Error("country must be a 2-letter ISO code with no hyphens (e.g. US, DE, GB)");
        }
    }
    const timeout = raw.timeout === undefined ? 60 : Number(raw.timeout);
    if (!Number.isFinite(timeout) || timeout < 1 || timeout > 120) {
        throw new Error("timeout must be between 1 and 120 seconds");
    }
    return {
        url: raw.url,
        limit,
        // Strict boolean check: anything other than literal `true` is false
        include_external: raw.include_external === true,
        country: raw.country,
        timeout,
    };
}
@@ -0,0 +1,8 @@
1
export interface RenderParams {
    /** Page to render (must begin with http:// or https://). */
    url: string;
    /** Output format (default "markdown"). */
    format?: "markdown" | "html" | "text";
    /** Optional CSS selector to wait for after DOMContentLoaded (max 200 chars). */
    wait_for?: string;
    /** Overall render timeout in seconds (5–120; default 60). */
    timeout?: number;
}
/** Render a page in a remote headless browser and return a JSON result envelope. */
export declare function novadaProxyRender(params: RenderParams, browserWsEndpoint: string): Promise<string>;
/** Validate raw input and apply defaults; throws on the first invalid field. */
export declare function validateRenderParams(raw: Record<string, unknown>): RenderParams;
@@ -0,0 +1,98 @@
1
+ import puppeteer from "puppeteer-core";
2
+ import { htmlToMarkdown, htmlToText, unicodeSafeTruncate } from "../utils.js";
3
+ import { QUOTA_NOTE } from "../validation.js";
4
/**
 * Render a page in a remote headless browser (puppeteer over a WebSocket
 * endpoint) and return a JSON string envelope { ok, tool, data, meta }.
 *
 * Navigates with waitUntil:"domcontentloaded", then optionally waits for
 * `wait_for` (a CSS selector) under a shared deadline so the combined time
 * never exceeds `timeout`. The DOM snapshot is converted to the requested
 * format and truncated if very large. Page and browser connection are always
 * released in finally blocks because sessions are billed per second.
 *
 * @param {object} params  validated render params (url, format, wait_for, timeout)
 * @param {string} browserWsEndpoint  WebSocket endpoint of the remote browser
 * @returns {Promise<string>} JSON-stringified result envelope
 * @throws {Error} on bad URL scheme, navigation/selector timeout, or WS failure
 */
export async function novadaProxyRender(params, browserWsEndpoint) {
    const { url, format = "markdown", wait_for, timeout = 60 } = params;
    if (!url.startsWith("http://") && !url.startsWith("https://")) {
        throw new Error("URL must start with http:// or https://");
    }
    const startTime = Date.now();
    const browser = await puppeteer.connect({
        browserWSEndpoint: browserWsEndpoint,
        defaultViewport: { width: 1366, height: 768 },
    });
    try {
        const page = await browser.newPage();
        try {
            // Use a shared deadline so goto + waitForSelector together never exceed timeout
            const deadline = Date.now() + timeout * 1000;
            const response = await page.goto(url, {
                waitUntil: "domcontentloaded",
                timeout: timeout * 1000,
            });
            if (wait_for) {
                const remaining = deadline - Date.now();
                if (remaining <= 0)
                    throw new Error(`Timeout waiting for selector: ${wait_for}`);
                await page.waitForSelector(wait_for, { timeout: remaining });
            }
            const html = await page.content();
            // Format dispatch: html passes through; text and markdown are converted
            const content = format === "html" ? html
                : format === "text" ? htmlToText(html)
                    : htmlToMarkdown(html);
            const truncated = content.length > 100_000;
            const finalContent = truncated
                ? unicodeSafeTruncate(content, 100_000) + "\n\n[... truncated — rendered page is large]"
                : content;
            const latency_ms = Date.now() - startTime;
            // page.goto may resolve with a null response — fall back to 200
            const statusCode = response?.status() ?? 200;
            const result = {
                ok: true,
                tool: "novada_proxy_render",
                data: {
                    url,
                    status_code: statusCode,
                    content: finalContent,
                    content_type: "text/html",
                    size_bytes: html.length,
                    format,
                },
                meta: {
                    latency_ms,
                    truncated,
                    quota: { credits_estimated: 5, note: "Browser API is metered separately — " + QUOTA_NOTE },
                },
            };
            return JSON.stringify(result);
        }
        finally {
            // Always close the page to avoid server-side session leak (billed by session-second)
            await page.close().catch(() => { });
        }
    }
    finally {
        // Always disconnect even if newPage() throws (quota exhaustion, WS drop)
        await browser.disconnect();
    }
}
68
/**
 * Validate raw render params and normalise defaults.
 * Throws a descriptive Error on the first invalid field.
 */
export function validateRenderParams(raw) {
    if (!raw.url || typeof raw.url !== "string") {
        throw new Error("url is required");
    }
    const hasScheme = raw.url.startsWith("http://") || raw.url.startsWith("https://");
    if (!hasScheme) {
        throw new Error("url must start with http:// or https://");
    }
    if (raw.format && !["markdown", "html", "text"].includes(raw.format)) {
        throw new Error("format must be markdown, html, or text");
    }
    if (raw.wait_for !== undefined) {
        const isShortString = typeof raw.wait_for === "string" && raw.wait_for.length <= 200;
        if (!isShortString) {
            throw new Error("wait_for must be a CSS selector string (max 200 chars)");
        }
        // Allowlist: only safe CSS selector characters — no backticks, braces, or semicolons
        const SAFE_SELECTOR = /^[a-zA-Z0-9\s\[\]().#:*>,~+="'_-]+$/;
        if (!SAFE_SELECTOR.test(raw.wait_for)) {
            throw new Error("wait_for contains invalid characters for a CSS selector");
        }
    }
    const timeout = raw.timeout === undefined ? 60 : Number(raw.timeout);
    if (!Number.isFinite(timeout) || timeout < 5 || timeout > 120) {
        throw new Error("timeout must be 5-120 seconds");
    }
    return {
        url: raw.url,
        format: raw.format || "markdown",
        wait_for: raw.wait_for,
        timeout,
    };
}
@@ -0,0 +1,9 @@
1
+ import type { ProxyAdapter, ProxyCredentials } from "../adapters/index.js";
2
export interface ResearchParams {
    /** Search query (non-empty, max 500 chars; trimmed by the validator). */
    query: string;
    /** Sources fetched per tier: quick=3, standard=5, deep=10 (default "standard"). */
    depth?: "quick" | "standard" | "deep";
    /** Optional geo-targeting country code (e.g. US). */
    country?: string;
    /** Per-fetch timeout in seconds (1–120; default 60). */
    timeout?: number;
}
/** Search, fetch the top results, and return extracted findings as a JSON envelope. */
export declare function novadaProxyResearch(params: ResearchParams, adapter: ProxyAdapter, credentials: ProxyCredentials, novadaApiKey: string): Promise<string>;
/** Validate raw input and apply defaults; throws on the first invalid field. */
export declare function validateResearchParams(raw: Record<string, unknown>): ResearchParams;
@@ -0,0 +1,126 @@
1
+ import { novadaProxySearch } from "./search.js";
2
+ import { novadaProxyBatchFetch } from "./batch.js";
3
+ import { novadaProxyFetch } from "./fetch.js";
4
+ import { SAFE_COUNTRY } from "../validation.js";
5
// Number of sources fetched per depth tier.
const DEPTH_MAP = { quick: 3, standard: 5, deep: 10 };
/**
 * Multi-step research pipeline: search → fetch top results → extract findings
 * → build a concatenated findings summary. Returns a JSON string envelope
 * { ok, tool, data, meta }.
 *
 * @param {object} params       validated research params (query, depth, country, timeout)
 * @param {object} adapter      proxy adapter forwarded to the fetch/batch tools
 * @param {object} credentials  proxy credentials forwarded to the fetch/batch tools
 * @param {string} novadaApiKey API key used by the search tool
 * @returns {Promise<string>} JSON-stringified result envelope
 */
export async function novadaProxyResearch(params, adapter, credentials, novadaApiKey) {
    const { query, depth = "standard", country, timeout = 60 } = params;
    const numSources = DEPTH_MAP[depth];
    const wallStart = Date.now();
    // Step 1: Search
    const searchResult = await novadaProxySearch({ query, num: numSources, country }, novadaApiKey);
    const searchParsed = JSON.parse(searchResult);
    const searchResults = searchParsed.data.results || [];
    if (searchResults.length === 0) {
        // Short-circuit: nothing to fetch — return an empty-findings envelope
        return JSON.stringify({
            ok: true,
            tool: "novada_proxy_research",
            data: {
                query,
                depth,
                sources_searched: 0,
                sources_fetched: 0,
                sources_failed: 0,
                findings: [],
                urls: [],
                findings_summary: "No search results found for this query.",
            },
            meta: { latency_ms: Date.now() - wallStart, quota: { credits_estimated: 1 } },
        });
    }
    // Step 2: Fetch top results
    const urls = searchResults.slice(0, numSources).map(r => r.url);
    let batchResults = [];
    if (urls.length === 1) {
        // batch_fetch requires minimum 2 URLs — fetch single URL directly
        try {
            const fetchResult = await novadaProxyFetch({ url: urls[0], format: "markdown", country, timeout }, adapter, credentials);
            const parsed = JSON.parse(fetchResult);
            batchResults = [{ url: urls[0], ok: true, content: parsed.data.content }];
        }
        catch {
            // Best-effort: a failed single fetch still yields a result entry
            batchResults = [{ url: urls[0], ok: false, error: { code: "FETCH_FAILED", message: "Failed to fetch" } }];
        }
    }
    else {
        const batchResult = await novadaProxyBatchFetch({ urls, format: "markdown", country, timeout, concurrency: 3 }, adapter, credentials);
        const batchParsed = JSON.parse(batchResult);
        batchResults = batchParsed.data.results || [];
    }
    // Step 3: Extract findings (title + preview) from each successful fetch
    const findings = batchResults
        .filter(r => r.ok && r.content)
        .map(r => {
        const content = r.content || "";
        // Title preference: first markdown H1, else first non-empty line, else URL
        const titleMatch = content.match(/^#\s+(.+)/m);
        const firstLine = content.split("\n").find(l => l.trim().length > 0)?.trim() || r.url;
        const title = titleMatch ? titleMatch[1].trim() : firstLine;
        const contentPreview = content.slice(0, 500).trim();
        return {
            title,
            url: r.url,
            snippet: searchResults.find(s => s.url === r.url)?.snippet || "",
            content_preview: contentPreview,
        };
    });
    // Step 4: Build findings summary (concatenated source previews — agent should analyze findings[] for deeper synthesis)
    const summaryParts = findings.map(f => {
        // Skip heading lines to get actual content for the summary
        const paragraphs = f.content_preview.split("\n\n").filter(p => !p.trim().startsWith("#"));
        const firstParagraph = paragraphs[0]?.trim() || f.snippet;
        return `According to ${f.title} (${f.url}): ${firstParagraph}`;
    });
    const findings_summary = summaryParts.length > 0
        ? summaryParts.join("\n\n")
        : "Unable to build findings summary — all source fetches failed.";
    const latency_ms = Date.now() - wallStart;
    const sourcesFetched = batchResults.filter(r => r.ok).length;
    const sourcesFailed = batchResults.filter(r => !r.ok).length;
    return JSON.stringify({
        ok: true,
        tool: "novada_proxy_research",
        data: {
            query,
            depth,
            sources_searched: searchResults.length,
            sources_fetched: sourcesFetched,
            sources_failed: sourcesFailed,
            findings,
            urls: findings.map(f => f.url),
            findings_summary,
        },
        meta: {
            latency_ms,
            // Estimate: 1 search credit plus one per fetched URL
            quota: { credits_estimated: 1 + urls.length },
        },
    });
}
98
/**
 * Validate raw research params and normalise defaults.
 * Throws a descriptive Error on the first invalid field; trims the query.
 */
export function validateResearchParams(raw) {
    const { query, depth, country } = raw;
    if (!query || typeof query !== "string") {
        throw new Error("query is required and must be a string");
    }
    if (query.trim().length === 0) {
        throw new Error("query must not be empty");
    }
    if (query.length > 500) {
        throw new Error("query must be 500 characters or less");
    }
    if (depth !== undefined && depth !== "quick" && depth !== "standard" && depth !== "deep") {
        throw new Error("depth must be 'quick', 'standard', or 'deep'");
    }
    if (country !== undefined) {
        const valid = typeof country === "string" && country.length <= 10 && SAFE_COUNTRY.test(country);
        if (!valid) {
            throw new Error("country must be a 2-letter ISO code with no hyphens (e.g. US, DE, GB)");
        }
    }
    const timeout = raw.timeout === undefined ? 60 : Number(raw.timeout);
    if (!Number.isFinite(timeout) || timeout < 1 || timeout > 120) {
        throw new Error("timeout must be between 1 and 120 seconds");
    }
    return {
        query: query.trim(),
        depth: depth || "standard",
        country,
        timeout,
    };
}