novada-proxy-core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/build/adapters/brightdata.d.ts +24 -0
  2. package/build/adapters/brightdata.js +56 -0
  3. package/build/adapters/generic.d.ts +32 -0
  4. package/build/adapters/generic.js +63 -0
  5. package/build/adapters/index.d.ts +16 -0
  6. package/build/adapters/index.js +42 -0
  7. package/build/adapters/novada.d.ts +23 -0
  8. package/build/adapters/novada.js +61 -0
  9. package/build/adapters/oxylabs.d.ts +22 -0
  10. package/build/adapters/oxylabs.js +54 -0
  11. package/build/adapters/smartproxy.d.ts +22 -0
  12. package/build/adapters/smartproxy.js +54 -0
  13. package/build/adapters/types.d.ts +58 -0
  14. package/build/adapters/types.js +7 -0
  15. package/build/config.d.ts +4 -0
  16. package/build/config.js +7 -0
  17. package/build/errors.d.ts +2 -0
  18. package/build/errors.js +58 -0
  19. package/build/index.d.ts +28 -0
  20. package/build/index.js +22 -0
  21. package/build/redact.d.ts +2 -0
  22. package/build/redact.js +24 -0
  23. package/build/tools/batch.d.ts +24 -0
  24. package/build/tools/batch.js +156 -0
  25. package/build/tools/crawl.d.ts +33 -0
  26. package/build/tools/crawl.js +604 -0
  27. package/build/tools/extract.d.ts +22 -0
  28. package/build/tools/extract.js +454 -0
  29. package/build/tools/fetch.d.ts +17 -0
  30. package/build/tools/fetch.js +243 -0
  31. package/build/tools/index.d.ts +19 -0
  32. package/build/tools/index.js +10 -0
  33. package/build/tools/map.d.ts +19 -0
  34. package/build/tools/map.js +131 -0
  35. package/build/tools/render.d.ts +8 -0
  36. package/build/tools/render.js +98 -0
  37. package/build/tools/research.d.ts +9 -0
  38. package/build/tools/research.js +126 -0
  39. package/build/tools/search.d.ts +9 -0
  40. package/build/tools/search.js +104 -0
  41. package/build/tools/session.d.ts +12 -0
  42. package/build/tools/session.js +108 -0
  43. package/build/tools/status.d.ts +2 -0
  44. package/build/tools/status.js +66 -0
  45. package/build/types.d.ts +34 -0
  46. package/build/types.js +1 -0
  47. package/build/utils.d.ts +18 -0
  48. package/build/utils.js +151 -0
  49. package/build/validation.d.ts +4 -0
  50. package/build/validation.js +6 -0
  51. package/package.json +50 -0
package/build/tools/extract.js
@@ -0,0 +1,454 @@
+ import { novadaProxyFetch } from "./fetch.js";
+ import { novadaProxyRender } from "./render.js";
+ import { decodeHtmlEntities, htmlToMarkdown, unicodeSafeTruncate, countHtmlTags, contentDensity } from "../utils.js";
+ import { SAFE_COUNTRY, SAFE_CITY, SAFE_SESSION_ID } from "../validation.js";
+ /**
+  * Extract structured data from a URL using pattern matching on the fetched HTML.
+  *
+  * Strategy: fetch the raw HTML, then use regex + heuristic extraction for each
+  * requested field. This is a lightweight alternative to LLM-based extraction —
+  * fast, deterministic, and zero additional API cost.
+  *
+  * For more complex extraction needs, agents can use novada_proxy_fetch(format="raw")
+  * and do their own parsing.
+  */
+ // Error messages that indicate the proxy fetch failed hard and render may succeed
+ const RENDER_ESCALATION_PATTERNS = [
+     "TLS", "SSL", "socket disconnect", "secure TLS",
+     "ECONNRESET", "ECONNREFUSED",
+     "403", "blocked", "bot detection",
+ ];
+ export function shouldEscalateToRender(msg) {
+     return RENDER_ESCALATION_PATTERNS.some(p => msg.toLowerCase().includes(p.toLowerCase()));
+ }
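
The escalation check is a case-insensitive substring scan, so a quick sketch of its behavior on illustrative error strings (not real proxy output) looks like this:

```ts
// Illustrative inputs only: matching is lowercase substring containment.
shouldEscalateToRender("socket disconnected before secure TLS connection"); // true  ("TLS")
shouldEscalateToRender("Request failed with status 403");                   // true  ("403")
shouldEscalateToRender("request timed out after 60s");                      // false (no pattern matches)
```
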
+ export async function novadaProxyExtract(params, adapter, credentials, browserWsEndpoint) {
+     const { url, fields, schema, country, city, session_id, timeout = 60, render_fallback = false } = params;
+     const startTime = Date.now();
+     let html = "";
+     let usedRender = false;
+     let fetchWarning;
+     // Attempt 1: proxy fetch (fast, cheap)
+     try {
+         const fetchParams = { url, format: "raw", country, city, session_id, timeout };
+         const fetchResultStr = await novadaProxyFetch(fetchParams, adapter, credentials);
+         const fetchResult = JSON.parse(fetchResultStr);
+         html = fetchResult.data.content || "";
+     }
+     catch (err) {
+         const msg = err instanceof Error ? err.message : String(err);
+         // Auto-escalate to render if enabled and the error suggests a JS/TLS block
+         if (render_fallback && browserWsEndpoint && shouldEscalateToRender(msg)) {
+             try {
+                 const renderResultStr = await novadaProxyRender({ url, format: "html", timeout }, browserWsEndpoint);
+                 const renderResult = JSON.parse(renderResultStr);
+                 html = renderResult.data.content || "";
+                 usedRender = true;
+                 fetchWarning = `Proxy fetch failed (${msg.slice(0, 80)}...) — escalated to render automatically`;
+             }
+             catch (renderErr) {
+                 // Both failed — re-throw the original fetch error
+                 throw err;
+             }
+         }
+         else {
+             throw err;
+         }
+     }
+     const latency_ms = Date.now() - startTime;
+     // ── Schema mode: LLM-ready extraction ─────────────────────────────────────
+     if (schema) {
+         const markdown = htmlToMarkdown(html);
+         const truncated = unicodeSafeTruncate(markdown, 50000);
+         const tagCount = countHtmlTags(html);
+         const density = contentDensity(truncated.length, tagCount);
+         const schemaEntries = Object.entries(schema)
+             .map(([key, desc]) => `- ${key}: ${desc}`)
+             .join("\n");
+         const extractionPrompt = `Extract the following fields from the page content provided in data.content of this response. Return ONLY a JSON object with the field names as keys. If a field cannot be found, set its value to null.\n\nFields to extract:\n${schemaEntries}`;
+         const result = {
+             ok: true,
+             tool: "novada_proxy_extract",
+             data: {
+                 mode: "llm_extract",
+                 url,
+                 schema: schema,
+                 content: truncated,
+                 extraction_prompt: extractionPrompt,
+                 content_length: truncated.length,
+                 ...(fetchWarning ? { fetch_warning: fetchWarning } : {}),
+             },
+             meta: {
+                 latency_ms,
+                 country,
+                 session_id,
+                 content_density: density,
+                 quota: {
+                     credits_estimated: usedRender ? 5 : 1,
+                     note: "Check dashboard.novada.com for real-time balance",
+                 },
+             },
+         };
+         if (!result.meta.country)
+             delete result.meta.country;
+         if (!result.meta.session_id)
+             delete result.meta.session_id;
+         return JSON.stringify(result);
+     }
+     // ── Heuristic mode (default) ────────────────────────────────────────────────
+     // Extract each requested field using pattern-based heuristics
+     const extractedFields = {};
+     for (const field of fields) {
+         extractedFields[field] = extractField(html, field, url);
+     }
+     const result = {
+         ok: true,
+         tool: "novada_proxy_extract",
+         data: {
+             mode: "heuristic",
+             url,
+             fields: extractedFields,
+             ...(fetchWarning ? { fetch_warning: fetchWarning } : {}),
+             ...(usedRender ? { extracted_via: "render" } : { extracted_via: "proxy_fetch" }),
+         },
+         meta: {
+             latency_ms,
+             country,
+             session_id,
+             // render costs 5 credits when used as fallback, else 1
+             quota: {
+                 credits_estimated: usedRender ? 5 : 1,
+                 note: "Check dashboard.novada.com for real-time balance",
+             },
+         },
+     };
+     if (!result.meta.country)
+         delete result.meta.country;
+     if (!result.meta.session_id)
+         delete result.meta.session_id;
+     return JSON.stringify(result);
+ }
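
For orientation, a minimal calling sketch. The `adapter`, `credentials`, and `browserWsEndpoint` values are placeholders here; they come from the package's adapters/config modules, whose exact shape this diff shows only in part:

```ts
import { novadaProxyExtract, validateExtractParams } from "./extract.js";

// Hypothetical wiring: adapter, credentials, and browserWsEndpoint are assumed inputs.
const params = validateExtractParams({
    url: "https://example.com/product/123",
    fields: ["title", "price", "image"],
    render_fallback: true,
});
const json = await novadaProxyExtract(params, adapter, credentials, browserWsEndpoint);
const { data, meta } = JSON.parse(json);
console.log(data.fields, meta.quota.credits_estimated); // 5 if render was used, else 1
```
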
+ /**
+  * Heuristic field extraction from HTML.
+  *
+  * Uses common patterns: meta tags, Open Graph, Schema.org JSON-LD, headings,
+  * and semantic HTML. Falls back to regex scanning for common field names.
+  */
+ export function extractField(html, field, baseUrl) {
+     const f = field.toLowerCase().trim();
+     // --- Title ---
+     if (f === "title" || f === "name" || f === "product_name") {
+         return (extractMetaContent(html, "og:title") ??
+             extractJsonLd(html, "name") ??
+             extractTag(html, "title") ??
+             extractTag(html, "h1") ??
+             null);
+     }
+     // --- Price ---
+     if (f === "price" || f === "cost") {
+         return (extractJsonLd(html, "price") ??
+             extractJsonLd(html, "lowPrice") ??
+             extractMetaContent(html, "product:price:amount") ??
+             extractPriceFromHtml(html) ??
+             null);
+     }
+     // --- Currency ---
+     if (f === "currency") {
+         return (extractJsonLd(html, "priceCurrency") ??
+             extractMetaContent(html, "product:price:currency") ??
+             null);
+     }
+     // --- Description ---
+     if (f === "description" || f === "summary") {
+         return (extractMetaContent(html, "og:description") ??
+             extractMetaContent(html, "description") ??
+             extractJsonLd(html, "description") ??
+             null);
+     }
+     // --- Image ---
+     if (f === "image" || f === "thumbnail" || f === "photo") {
+         return (extractMetaContent(html, "og:image") ??
+             extractJsonLd(html, "image") ??
+             extractFirstImage(html, baseUrl) ??
+             null);
+     }
+     // --- Rating / Reviews ---
+     if (f === "rating" || f === "score") {
+         return (extractJsonLd(html, "ratingValue") ??
+             null);
+     }
+     if (f === "review_count" || f === "reviews" || f === "rating_count") {
+         return (extractJsonLd(html, "reviewCount") ??
+             extractJsonLd(html, "ratingCount") ??
+             null);
+     }
+     // --- Author ---
+     if (f === "author" || f === "creator") {
+         return (extractJsonLd(html, "author") ??
+             extractMetaContent(html, "author") ??
+             extractMetaContent(html, "article:author") ??
+             null);
+     }
+     // --- Date ---
+     if (f === "date" || f === "published" || f === "publish_date") {
+         return (extractJsonLd(html, "datePublished") ??
+             extractMetaContent(html, "article:published_time") ??
+             extractMetaContent(html, "date") ??
+             null);
+     }
+     // --- URL / Canonical ---
+     if (f === "url" || f === "canonical") {
+         return (extractMetaContent(html, "og:url") ??
+             extractCanonical(html) ??
+             null);
+     }
+     // --- Links (returns array) ---
+     if (f === "links" || f === "urls") {
+         return extractAllLinks(html, baseUrl);
+     }
+     // --- Headings (returns array) ---
+     if (f === "headings" || f === "h1" || f === "h2") {
+         const tag = f === "h2" ? "h2" : "h1";
+         return extractAllTags(html, tag);
+     }
+     // --- Generic fallback: try JSON-LD, then meta ---
+     return (extractJsonLd(html, field) ??
+         extractMetaContent(html, field) ??
+         null);
+ }
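
A small sketch of the fallback chain on a made-up HTML fragment: Open Graph wins over the `<title>` tag, and fields with no match return null:

```ts
// Made-up fragment: og:title takes priority over <title> and <h1>.
const sample = `<head><meta property="og:title" content="Acme Widget"><title>acme.com</title></head>`;
extractField(sample, "title", "https://acme.example");       // "Acme Widget"
extractField(sample, "description", "https://acme.example"); // null (no matching meta or JSON-LD)
```
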
+ // --- Extraction helpers ---
+ function extractMetaContent(html, name) {
+     // Match both name= and property= attributes
+     const patterns = [
+         new RegExp(`<meta[^>]+(?:property|name)=["']${escapeRegex(name)}["'][^>]+content=["']([^"']+)["']`, "i"),
+         new RegExp(`<meta[^>]+content=["']([^"']+)["'][^>]+(?:property|name)=["']${escapeRegex(name)}["']`, "i"),
+     ];
+     for (const re of patterns) {
+         const m = html.match(re);
+         if (m?.[1])
+             return decodeHtmlEntities(m[1]);
+     }
+     return null;
+ }
+ function extractJsonLd(html, key) {
+     const ldBlocks = html.match(/<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi);
+     if (!ldBlocks)
+         return null;
+     for (const block of ldBlocks) {
+         const jsonMatch = block.match(/>([^<]+)</s);
+         if (!jsonMatch?.[1])
+             continue;
+         try {
+             const data = JSON.parse(jsonMatch[1]);
+             const value = deepFind(data, key);
+             if (value !== undefined) {
+                 return typeof value === "object" ? JSON.stringify(value) : String(value);
+             }
+         }
+         catch {
+             // malformed JSON-LD — skip
+         }
+     }
+     return null;
+ }
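
Because `extractJsonLd` hands the parsed block to `deepFind`, nested Schema.org values are reachable without knowing the exact document structure. A sketch on a fabricated Product block:

```ts
// Fabricated JSON-LD: "price" is found nested under "offers" via deepFind.
const page = `<script type="application/ld+json">{"@type":"Product","offers":{"price":"19.99","priceCurrency":"EUR"}}</script>`;
extractJsonLd(page, "price");         // "19.99"
extractJsonLd(page, "priceCurrency"); // "EUR"
extractJsonLd(page, "sku");           // null (key absent)
```
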
+ function extractTag(html, tag) {
+     const re = new RegExp(`<${tag}[^>]*>([^<]+)</${tag}>`, "i");
+     const m = html.match(re);
+     return m?.[1] ? decodeHtmlEntities(m[1]).trim() : null;
+ }
+ function extractAllTags(html, tag) {
+     const re = new RegExp(`<${tag}[^>]*>([^<]+)</${tag}>`, "gi");
+     const results = [];
+     let m;
+     while ((m = re.exec(html)) !== null) {
+         if (m[1])
+             results.push(decodeHtmlEntities(m[1]).trim());
+     }
+     return results.length ? results : [];
+ }
+ function extractAllLinks(html, baseUrl) {
+     const re = /<a[^>]+href=["']([^"']+)["']/gi;
+     const results = [];
+     let m;
+     while ((m = re.exec(html)) !== null) {
+         const href = m[1];
+         if (!href)
+             continue;
+         let resolved = href;
+         if (!href.startsWith("http") && baseUrl) {
+             try {
+                 resolved = new URL(href, baseUrl).toString();
+             }
+             catch {
+                 continue;
+             }
+         }
+         if (resolved.startsWith("http") && !results.includes(resolved)) {
+             results.push(resolved);
+         }
+     }
+     return results.slice(0, 50); // cap at 50 links
+ }
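
Relative hrefs are resolved against `baseUrl` and duplicates are dropped, so a quick sketch on fabricated markup:

```ts
// Fabricated markup: relative links resolve, duplicates collapse, result is capped at 50.
const body = `<a href="/docs">Docs</a><a href="https://other.example/a">A</a><a href="/docs">Again</a>`;
extractAllLinks(body, "https://example.com");
// → ["https://example.com/docs", "https://other.example/a"]
```
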
+ function extractCanonical(html) {
+     const m = html.match(/<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)["']/i);
+     return m?.[1] ?? null;
+ }
+ const SKIP_IMAGE_PATTERNS = /icon|logo|pixel|tracking|1x1|spacer/i;
+ function extractFirstImage(html, baseUrl) {
+     const re = /<img[^>]+src=["']([^"']+)["']/gi;
+     let m;
+     while ((m = re.exec(html)) !== null) {
+         const src = m[1];
+         if (!src || SKIP_IMAGE_PATTERNS.test(src))
+             continue;
+         if (src.startsWith("data:"))
+             continue;
+         if (src.startsWith("http"))
+             return src;
+         if (baseUrl) {
+             try {
+                 return new URL(src, baseUrl).toString();
+             }
+             catch {
+                 continue;
+             }
+         }
+     }
+     return null;
+ }
+ function extractPriceFromHtml(html) {
+     // Strategy 1: price inside an element with "price" in its class (most reliable)
+     const priceClassRe = /(?:class=["'][^"']*price[^"']*["'][^>]*>)\s*[^<]*?([$€£¥]\s*[\d,.]+)/i;
+     const m = priceClassRe.exec(html);
+     if (m?.[1])
+         return m[1].trim();
+     // Strategy 2: collect ALL currency patterns, return the most likely product price
+     // (skip very small amounts like $0, $1, $2 which are often shipping thresholds or discounts)
+     const allPricesRe = /([$€£¥])\s*(\d[\d,.]*)/g;
+     const prices = [];
+     let pm;
+     while ((pm = allPricesRe.exec(html)) !== null) {
+         const raw = `${pm[1]}${pm[2]}`.trim();
+         const value = parseFloat(pm[2].replace(/,/g, ""));
+         if (Number.isFinite(value) && value > 0)
+             prices.push({ raw, value });
+     }
+     if (prices.length === 0)
+         return null;
+     // Filter out likely noise: prices under $5 are usually shipping/discount thresholds
+     const plausible = prices.filter(p => p.value >= 5);
+     if (plausible.length > 0) {
+         // Return the first plausible price (most likely the product price in page order)
+         return plausible[0].raw;
+     }
+     // If all prices are under $5, return the highest one
+     prices.sort((a, b) => b.value - a.value);
+     return prices[0].raw;
+ }
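
Two fabricated cases illustrate the strategy order: a `price` class short-circuits at strategy 1, and without one the $5 plausibility floor filters shipping-style amounts:

```ts
// Strategy 1: the class="price" element wins immediately.
extractPriceFromHtml(`<span class="price">$49.99</span><p>Shipping from $3.99</p>`); // "$49.99"
// Strategy 2: no price class, so $3 falls below the $5 floor and $129.00 survives.
extractPriceFromHtml(`<p>Only $3 shipping. Price: $129.00 today</p>`);               // "$129.00"
```
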
+ export function deepFind(obj, key, depth = 0) {
+     if (depth > 20)
+         return undefined;
+     if (obj === null || obj === undefined)
+         return undefined;
+     if (typeof obj !== "object")
+         return undefined;
+     if (Array.isArray(obj)) {
+         for (const item of obj) {
+             const found = deepFind(item, key, depth + 1);
+             if (found !== undefined)
+                 return found;
+         }
+         return undefined;
+     }
+     const record = obj;
+     if (key in record)
+         return record[key];
+     for (const v of Object.values(record)) {
+         const found = deepFind(v, key, depth + 1);
+         if (found !== undefined)
+             return found;
+     }
+     return undefined;
+ }
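
`deepFind` is a depth-first search with a depth cap of 20; arrays are walked element by element and the first hit wins. A tiny sketch:

```ts
deepFind({ offers: [{ seller: "x" }, { price: 9.5 }] }, "price"); // 9.5 (found inside the array)
deepFind({ a: { b: {} } }, "missing");                            // undefined
```
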
+ function escapeRegex(s) {
+     return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+ }
+ // --- Validation ---
+ const KEY_PATTERN = /^[a-zA-Z][a-zA-Z0-9_]*$/;
+ export function validateExtractParams(raw) {
+     if (!raw.url || typeof raw.url !== "string") {
+         throw new Error("url is required and must be a string");
+     }
+     if (!raw.url.startsWith("http://") && !raw.url.startsWith("https://")) {
+         throw new Error("url must start with http:// or https://");
+     }
+     // Validate schema (takes precedence over fields)
+     let schema;
+     if (raw.schema !== undefined) {
+         if (typeof raw.schema !== "object" || raw.schema === null || Array.isArray(raw.schema)) {
+             throw new Error("schema must be an object with string keys and string values");
+         }
+         const schemaObj = raw.schema;
+         const keys = Object.keys(schemaObj);
+         if (keys.length === 0) {
+             throw new Error("schema must have at least 1 field");
+         }
+         if (keys.length > 20) {
+             throw new Error("schema must have at most 20 fields");
+         }
+         for (const [key, val] of Object.entries(schemaObj)) {
+             if (!KEY_PATTERN.test(key) || key.length > 50) {
+                 throw new Error("schema keys must be alphanumeric/underscore, start with letter, max 50 chars");
+             }
+             if (typeof val !== "string" || val.trim().length === 0) {
+                 throw new Error("schema values must be non-empty strings");
+             }
+             if (val.length > 200) {
+                 throw new Error("schema value descriptions must be 200 characters or less");
+             }
+         }
+         schema = schemaObj;
+     }
+     // fields is required only when schema is not provided
+     if (!schema) {
+         if (!raw.fields || !Array.isArray(raw.fields) || raw.fields.length === 0) {
+             throw new Error("fields is required — provide an array of field names to extract (e.g. [\"title\", \"price\", \"description\"])");
+         }
+         if (raw.fields.length > 20) {
+             throw new Error("fields must contain at most 20 field names");
+         }
+         for (const f of raw.fields) {
+             if (typeof f !== "string" || f.length > 50) {
+                 throw new Error("each field must be a string of 50 characters or less");
+             }
+         }
+     }
+     if (raw.country !== undefined) {
+         if (typeof raw.country !== "string" || raw.country.length > 10 || !SAFE_COUNTRY.test(raw.country)) {
+             throw new Error("country must be a 2-letter ISO code with no hyphens (e.g. US, DE, GB)");
+         }
+     }
+     if (raw.city !== undefined) {
+         if (typeof raw.city !== "string" || raw.city.length > 50 || !SAFE_CITY.test(raw.city)) {
+             throw new Error("city must contain only letters, numbers, underscores, max 50 chars");
+         }
+     }
+     if (raw.session_id !== undefined) {
+         if (typeof raw.session_id !== "string" || raw.session_id.length > 64 || !SAFE_SESSION_ID.test(raw.session_id)) {
+             throw new Error("session_id must contain only letters, numbers, and underscores, max 64 chars (no hyphens)");
+         }
+     }
+     const timeout = raw.timeout !== undefined ? Number(raw.timeout) : 60;
+     if (!Number.isFinite(timeout) || timeout < 1 || timeout > 120) {
+         throw new Error("timeout must be between 1 and 120 seconds");
+     }
+     return {
+         url: raw.url,
+         fields: raw.fields ?? [],
+         schema,
+         country: raw.country,
+         city: raw.city,
+         session_id: raw.session_id,
+         timeout,
+         render_fallback: raw.render_fallback === true,
+     };
+ }
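
A short sketch of the validation contract on fabricated inputs; note that `schema` takes precedence, so `fields` may be omitted when a schema is supplied:

```ts
// Valid: schema mode, fields defaults to [].
validateExtractParams({ url: "https://example.com", schema: { title: "the page title" } });
// Throws: "url must start with http:// or https://"
validateExtractParams({ url: "ftp://example.com", fields: ["title"] });
// Throws: "timeout must be between 1 and 120 seconds"
validateExtractParams({ url: "https://example.com", fields: ["title"], timeout: 300 });
```
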
package/build/tools/fetch.d.ts
@@ -0,0 +1,17 @@
+ import type { ProxyAdapter, ProxyCredentials } from "../adapters/index.js";
+ /** Returns the configured TTL in seconds. 0 = cache disabled. */
+ export declare function getCacheTtl(): number;
+ /** Cache key: url + format + country (country affects what you receive back). */
+ export declare function makeCacheKey(url: string, format: string, country?: string): string;
+ /** Clear the entire cache (useful for tests and manual cache invalidation). */
+ export declare function clearResponseCache(): void;
+ export interface FetchParams {
+     url: string;
+     country?: string;
+     city?: string;
+     session_id?: string;
+     format?: "raw" | "markdown";
+     timeout?: number;
+ }
+ export declare function novadaProxyFetch(params: FetchParams, adapter: ProxyAdapter, credentials: ProxyCredentials): Promise<string>;
+ export declare function validateFetchParams(raw: Record<string, unknown>): FetchParams;
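
Reading these declarations together: `novadaProxyFetch` returns a JSON string (the same envelope `extract.js` parses above), and the cache keys on url + format + country. A hedged sketch of the intended composition; the import path, `adapter`, and `credentials` are assumptions for illustration:

```ts
import { novadaProxyFetch, validateFetchParams, getCacheTtl, makeCacheKey } from "./fetch.js";

const params = validateFetchParams({ url: "https://example.com", format: "markdown", country: "US" });
if (getCacheTtl() > 0) {
    // Same key within the TTL: a repeat fetch may be served from cache.
    console.log(makeCacheKey(params.url, "markdown", params.country));
}
const envelope = JSON.parse(await novadaProxyFetch(params, adapter, credentials));
console.log(envelope.data.content);
```
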