novada-proxy-core 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/adapters/brightdata.d.ts +24 -0
- package/build/adapters/brightdata.js +56 -0
- package/build/adapters/generic.d.ts +32 -0
- package/build/adapters/generic.js +63 -0
- package/build/adapters/index.d.ts +16 -0
- package/build/adapters/index.js +42 -0
- package/build/adapters/novada.d.ts +23 -0
- package/build/adapters/novada.js +61 -0
- package/build/adapters/oxylabs.d.ts +22 -0
- package/build/adapters/oxylabs.js +54 -0
- package/build/adapters/smartproxy.d.ts +22 -0
- package/build/adapters/smartproxy.js +54 -0
- package/build/adapters/types.d.ts +58 -0
- package/build/adapters/types.js +7 -0
- package/build/config.d.ts +4 -0
- package/build/config.js +7 -0
- package/build/errors.d.ts +2 -0
- package/build/errors.js +58 -0
- package/build/index.d.ts +28 -0
- package/build/index.js +22 -0
- package/build/redact.d.ts +2 -0
- package/build/redact.js +24 -0
- package/build/tools/batch.d.ts +24 -0
- package/build/tools/batch.js +156 -0
- package/build/tools/crawl.d.ts +33 -0
- package/build/tools/crawl.js +604 -0
- package/build/tools/extract.d.ts +22 -0
- package/build/tools/extract.js +454 -0
- package/build/tools/fetch.d.ts +17 -0
- package/build/tools/fetch.js +243 -0
- package/build/tools/index.d.ts +19 -0
- package/build/tools/index.js +10 -0
- package/build/tools/map.d.ts +19 -0
- package/build/tools/map.js +131 -0
- package/build/tools/render.d.ts +8 -0
- package/build/tools/render.js +98 -0
- package/build/tools/research.d.ts +9 -0
- package/build/tools/research.js +126 -0
- package/build/tools/search.d.ts +9 -0
- package/build/tools/search.js +104 -0
- package/build/tools/session.d.ts +12 -0
- package/build/tools/session.js +108 -0
- package/build/tools/status.d.ts +2 -0
- package/build/tools/status.js +66 -0
- package/build/types.d.ts +34 -0
- package/build/types.js +1 -0
- package/build/utils.d.ts +18 -0
- package/build/utils.js +151 -0
- package/build/validation.d.ts +4 -0
- package/build/validation.js +6 -0
- package/package.json +50 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/** Arguments accepted by the novada_proxy_search tool. */
export interface SearchParams {
    /** Search query text; validateSearchParams caps it at 500 characters. */
    query: string;
    /** Search engine — only "google" is accepted by validateSearchParams. */
    engine?: "google";
    /** Number of results to return (validated range 1–20, default 10). */
    num?: number;
    /** Locale country code, e.g. "us" — restricted to /^[a-zA-Z0-9_-]{1,10}$/. */
    country?: string;
    /** Locale language code, e.g. "en" — same charset restriction as country. */
    language?: string;
}
/**
 * Execute a search through the Novada Scraper API.
 * Resolves to a JSON-encoded success envelope ({ ok, tool, data, meta }).
 */
export declare function novadaProxySearch(params: SearchParams, novadaApiKey: string): Promise<string>;
/**
 * Validate raw tool arguments; throws Error with an agent-readable message on
 * any invalid field and returns the normalized SearchParams otherwise.
 */
export declare function validateSearchParams(raw: Record<string, unknown>): SearchParams;
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import axios from "axios";
|
|
2
|
+
import { NOVADA_SEARCH_URL, DEFAULT_USER_AGENT } from "../config.js";
|
|
3
|
+
import { QUOTA_NOTE } from "../validation.js";
|
|
4
|
+
// Locale values are interpolated into the request query string; keep them to a
// short alphanumeric charset so they cannot smuggle extra parameters.
const SAFE_LOCALE = /^[a-zA-Z0-9_-]{1,10}$/;

/**
 * Run a query through the Novada search API and return a JSON-encoded success
 * envelope ({ ok, tool, data, meta }) as a string.
 *
 * @param params SearchParams-shaped object: { query, engine?, num?, country?, language? }.
 * @param novadaApiKey Novada API key; the endpoint requires it as a query parameter.
 * @returns Promise resolving to a JSON string with data.results = [{ title, url, snippet }].
 * @throws Error on invalid locale params, transport failure, or a non-OK provider
 *   code. Every error message is scrubbed of the api_key before being thrown.
 */
export async function novadaProxySearch(params, novadaApiKey) {
    const { query, engine = "google", num = 10, country = "", language = "" } = params;
    // Guard: validate locale params even when called directly (not via validateSearchParams)
    if (country && !SAFE_LOCALE.test(country))
        throw new Error("country contains invalid characters");
    if (language && !SAFE_LOCALE.test(language))
        throw new Error("language contains invalid characters");
    // Note: Novada Scraper API authenticates via query param (api_key), not header.
    // The key is therefore visible in server-side access logs — this is an API design
    // constraint of the current Novada endpoint. We mitigate by never including the
    // key in error messages surfaced to the agent (see sanitize below).
    //
    // FIX: sanitize is hoisted above the request so BOTH the transport-error path
    // and the provider-level error path (data.code) scrub the key — the provider
    // may echo the request URL (which embeds api_key) in its msg field.
    const sanitize = (s) => s.replaceAll(novadaApiKey, "***")
        .replaceAll(encodeURIComponent(novadaApiKey), "***");
    const searchParams = new URLSearchParams({
        q: query,
        api_key: novadaApiKey,
        engine,
        num: String(num),
    });
    if (country)
        searchParams.set("country", country);
    if (language)
        searchParams.set("language", language);
    const requestUrl = `${NOVADA_SEARCH_URL}?${searchParams.toString()}`;
    const startTime = Date.now();
    let response;
    try {
        response = await axios.get(requestUrl, {
            headers: {
                "User-Agent": DEFAULT_USER_AGENT,
                Origin: "https://www.novada.com",
                Referer: "https://www.novada.com/",
            },
            timeout: 30000,
        });
    }
    catch (err) {
        // Never surface the request URL (it contains api_key) in error messages.
        if (axios.isAxiosError(err)) {
            const status = err.response?.status;
            const msg = sanitize(String(err.response?.data?.msg || err.message));
            throw new Error(status ? `Search API HTTP ${status}: ${msg}` : `Search API error: ${msg}`);
        }
        throw new Error(sanitize(String(err instanceof Error ? err.message : err)));
    }
    const latency_ms = Date.now() - startTime;
    const data = response.data;
    // Provider-level failure: HTTP 200 but a non-OK body code.
    // FIX: msg is now sanitized — previously this path leaked it unscrubbed.
    if (data.code && data.code !== 200 && data.code !== 0) {
        throw new Error(`Novada search error (${data.code}): ${sanitize(String(data.msg || "unknown"))}`);
    }
    // Result arrays appear under several names across Novada response shapes.
    const rawResults = data.data?.organic_results || data.organic_results || data.data?.results || data.results || [];
    const results = rawResults.slice(0, num).map(r => ({
        title: r.title || "Untitled",
        url: r.redirection_link || r.url || r.link || "",
        snippet: r.description || r.snippet || "",
    }));
    const result = {
        ok: true,
        tool: "novada_proxy_search",
        data: {
            query,
            engine,
            count: results.length,
            results,
        },
        meta: {
            latency_ms,
            quota: { credits_estimated: 1, note: QUOTA_NOTE },
        },
    };
    return JSON.stringify(result);
}
|
|
78
|
+
/**
 * Validate and normalize raw search-tool arguments into SearchParams.
 * Throws an Error with an agent-readable message on the first invalid field.
 */
export function validateSearchParams(raw) {
    // Same charset as the module-level SAFE_LOCALE guard in novadaProxySearch.
    const localeOk = (v) => typeof v === "string" && /^[a-zA-Z0-9_-]{1,10}$/.test(v);
    if (typeof raw.query !== "string" || raw.query.length === 0) {
        throw new Error("query is required");
    }
    if (raw.query.length > 500) {
        throw new Error("query must be 500 characters or less");
    }
    // Anything truthy other than the literal "google" is rejected.
    const engine = raw.engine || "google";
    if (engine !== "google") {
        throw new Error("engine must be 'google' — other engines have known quality issues");
    }
    const num = raw.num === undefined ? 10 : Number(raw.num);
    // NaN / ±Infinity fail the range comparison, so no separate finiteness check is needed.
    if (!(num >= 1 && num <= 20)) {
        throw new Error("num must be between 1 and 20");
    }
    if (raw.country && !localeOk(raw.country)) {
        throw new Error("country must be a short locale code (e.g. us, uk, de)");
    }
    if (raw.language && !localeOk(raw.language)) {
        throw new Error("language must be a short language code (e.g. en, zh, de)");
    }
    return {
        query: raw.query,
        engine,
        num,
        country: raw.country || "",
        language: raw.language || "",
    };
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { ProxyAdapter, ProxyCredentials } from "../adapters/index.js";
/** Arguments accepted by the novada_proxy_session tool. */
export interface SessionParams {
    /** Sticky-session label — letters, digits, underscores only, max 64 chars (no hyphens). */
    session_id: string;
    /** Target URL; must start with http:// or https://. */
    url: string;
    /** Optional country code — validated against /^[a-zA-Z0-9_]+$/, max 10 chars. */
    country?: string;
    /** Optional city name — validated against /^[a-zA-Z0-9_]+$/, max 50 chars. */
    city?: string;
    /** Output format; defaults to "markdown". */
    format?: "raw" | "markdown";
    /** Request timeout in seconds (validated range 1–120, default 60). */
    timeout?: number;
    /**
     * When true (and the adapter reports sticky support), two extra IP-echo
     * calls are made through the same session to confirm the exit IP is stable.
     */
    verify_sticky?: boolean;
}
/**
 * Fetch `url` through a sticky proxy session; resolves to a JSON-encoded
 * response string (the fetch envelope with session metadata merged in).
 */
export declare function novadaProxySession(params: SessionParams, adapter: ProxyAdapter, credentials: ProxyCredentials): Promise<string>;
/**
 * Validate raw tool arguments; throws on invalid fields and returns the
 * normalized SessionParams otherwise.
 */
export declare function validateSessionParams(raw: Record<string, unknown>): SessionParams;
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import axios from "axios";
|
|
2
|
+
import { HttpsProxyAgent } from "https-proxy-agent";
|
|
3
|
+
import { novadaProxyFetch } from "./fetch.js";
|
|
4
|
+
import { SAFE_COUNTRY, SAFE_CITY, SAFE_SESSION_ID, QUOTA_NOTE } from "../validation.js";
|
|
5
|
+
/**
 * Fetch a URL through a sticky proxy session by delegating to novadaProxyFetch,
 * optionally verifying (via two IP-echo calls) that the session keeps the same
 * exit IP, then re-wrapping the result under the session tool name.
 */
export async function novadaProxySession(params, adapter, credentials) {
    const { verify_sticky = false } = params;
    // Delegate the actual page fetch to the fetch tool, pinning the session.
    const rawFetchJson = await novadaProxyFetch({
        url: params.url,
        session_id: params.session_id,
        country: params.country,
        city: params.city,
        format: params.format || "markdown",
        timeout: params.timeout,
    }, adapter, credentials);
    let fetchResult;
    try {
        fetchResult = JSON.parse(rawFetchJson);
    }
    catch {
        // Unparseable payload — pass the raw fetch result through untouched.
        return rawFetchJson;
    }
    // Optional stickiness verification: two IP lookups through the SAME
    // session must report the same origin IP.
    let session_verified;
    if (verify_sticky && adapter.capabilities.sticky) {
        try {
            const proxyUrl = adapter.buildProxyUrl(credentials, {
                session_id: params.session_id,
                country: params.country,
            });
            const httpsAgent = new HttpsProxyAgent(proxyUrl);
            // Helper: echo the proxy's exit IP via httpbin (first entry of "origin").
            const lookupExitIp = async () => {
                const resp = await axios.get("https://httpbin.org/ip", {
                    httpsAgent,
                    proxy: false,
                    timeout: 15000,
                });
                return resp.data.origin?.split(",")[0]?.trim();
            };
            const firstIp = await lookupExitIp();
            const secondIp = await lookupExitIp();
            session_verified = firstIp !== undefined && secondIp !== undefined && firstIp === secondIp;
        }
        catch {
            // Verification attempt itself failed — report the session as not verified.
            session_verified = false;
        }
    }
    // credits: 1 base + 2 for verify_sticky (2 httpbin calls)
    const creditsEstimated = verify_sticky ? 3 : 1;
    // Re-label the envelope for this tool and merge session metadata.
    const result = {
        ...fetchResult,
        tool: "novada_proxy_session",
        meta: {
            ...fetchResult.meta,
            session_id: params.session_id,
            session_verified,
            quota: { credits_estimated: creditsEstimated, note: QUOTA_NOTE },
        },
    };
    // Drop the key entirely when verification was not attempted.
    if (result.meta.session_verified === undefined)
        delete result.meta.session_verified;
    return JSON.stringify(result);
}
|
|
72
|
+
/**
 * Validate and normalize raw session-tool arguments into SessionParams.
 * Throws an Error with an agent-readable message on the first invalid field.
 */
export function validateSessionParams(raw) {
    // Hyphen-free identifier charset (mirrors validation.js): providers use "-"
    // as the segment delimiter inside proxy usernames, so allowing it would let
    // a value inject extra auth segments.
    const identRe = /^[a-zA-Z0-9_]+$/;
    const { session_id, url, country, city, format, verify_sticky } = raw;
    if (typeof session_id !== "string" || session_id.length === 0 ||
        session_id.length > 64 || !identRe.test(session_id)) {
        throw new Error("session_id is required — letters, numbers, underscores only, max 64 chars (no hyphens)");
    }
    if (typeof url !== "string" || url.length === 0) {
        throw new Error("url is required");
    }
    if (!/^https?:\/\//.test(url)) {
        throw new Error("url must start with http:// or https://");
    }
    if (country !== undefined &&
        (typeof country !== "string" || country.length > 10 || !identRe.test(country))) {
        throw new Error("country must be a 2-letter ISO code with no hyphens (e.g. US, DE, GB)");
    }
    if (city !== undefined &&
        (typeof city !== "string" || city.length > 50 || !identRe.test(city))) {
        throw new Error("city must contain only letters, numbers, underscores, max 50 chars (e.g. newyork, london)");
    }
    if (format && format !== "raw" && format !== "markdown") {
        throw new Error("format must be 'raw' or 'markdown'");
    }
    const timeout = raw.timeout === undefined ? 60 : Number(raw.timeout);
    if (!Number.isFinite(timeout) || timeout < 1 || timeout > 120) {
        throw new Error("timeout must be between 1 and 120 seconds");
    }
    return {
        session_id,
        url,
        country,
        city,
        format: format || "markdown",
        timeout,
        verify_sticky: verify_sticky === true,
    };
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import axios from "axios";
|
|
2
|
+
import { HttpsProxyAgent } from "https-proxy-agent";
|
|
3
|
+
import { VERSION } from "../config.js";
|
|
4
|
+
import { QUOTA_NOTE } from "../validation.js";
|
|
5
|
+
/**
 * Health-check tool: probes proxy connectivity by fetching an IP-echo endpoint
 * through the configured adapter and reports the provider's capability flags.
 * Never throws on connectivity failure — that is reported as "UNAVAILABLE".
 * Resolves to a JSON-encoded success envelope.
 */
export async function novadaProxyStatus(adapter, credentials) {
    const startTime = Date.now();
    const verified_via = "https://httpbin.org/ip";
    let connectivity_status = "UNAVAILABLE";
    let proxy_ip;
    // Only probe when a provider is actually configured.
    if (adapter && credentials) {
        try {
            const httpsAgent = new HttpsProxyAgent(adapter.buildProxyUrl(credentials, {}));
            const response = await axios.get(verified_via, {
                httpsAgent,
                // Disable axios' own env-proxy handling; the agent does the tunneling.
                proxy: false,
                timeout: 10000,
            });
            const exitIp = response.data.origin?.split(",")[0]?.trim();
            if (exitIp) {
                proxy_ip = exitIp;
                connectivity_status = "HEALTHY";
            }
            else {
                // Reached the endpoint but the response shape was unexpected.
                connectivity_status = "DEGRADED";
            }
        }
        catch {
            connectivity_status = "UNAVAILABLE";
        }
    }
    const latency_ms = Date.now() - startTime;
    // Translate adapter capability flags into public capability labels
    // (order matters: country, city, sticky — matches the original output).
    const capabilities = [];
    if (adapter) {
        const labelByFlag = {
            country: "country_targeting",
            city: "city_targeting",
            sticky: "sticky_sessions",
        };
        for (const [flag, label] of Object.entries(labelByFlag)) {
            if (adapter.capabilities[flag])
                capabilities.push(label);
        }
    }
    const result = {
        ok: true,
        tool: "novada_proxy_status",
        data: {
            provider: adapter?.displayName || "none configured",
            version: VERSION,
            capabilities,
            connectivity: {
                status: connectivity_status,
                verified_via,
                proxy_ip,
                latency_ms,
            },
        },
        meta: {
            latency_ms,
            quota: { credits_estimated: 1, note: QUOTA_NOTE },
        },
    };
    // Keep the serialized payload free of an undefined proxy_ip.
    if (!proxy_ip) {
        delete result.data.connectivity.proxy_ip;
    }
    return JSON.stringify(result);
}
|
package/build/types.d.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/** Machine-readable error categories surfaced to the calling agent. */
export type ProxyErrorCode = "BOT_DETECTION_SUSPECTED" | "SESSION_STICKINESS_FAILED" | "RATE_LIMITED" | "INVALID_INPUT" | "TIMEOUT" | "TLS_ERROR" | "NETWORK_ERROR" | "PROVIDER_NOT_CONFIGURED" | "UNKNOWN_ERROR";
/** Credit-usage estimate attached to tool responses. */
export interface QuotaMeta {
    /** Estimated credits consumed by this call. */
    credits_estimated: number;
    /** Human-readable pointer to the authoritative balance source. */
    note: string;
}
/** Envelope returned by every tool on success (serialized to a JSON string). */
export interface ProxySuccessResponse {
    ok: true;
    /** Tool identifier, e.g. "novada_proxy_search". */
    tool: string;
    /** Tool-specific payload. */
    data: Record<string, unknown>;
    meta: {
        /** Wall-clock duration of the call in milliseconds. */
        latency_ms: number;
        /** Exit IP observed through the proxy, when available. */
        proxy_ip?: string;
        country?: string;
        /** Sticky-session label the call was pinned to, when applicable. */
        session_id?: string;
        /** Result of sticky-session verification (present only when attempted). */
        session_verified?: boolean;
        /** True when the returned content was cut to a size limit. */
        truncated?: boolean;
        /** Text-to-markup ratio in [0, 1]; higher = cleaner content. */
        content_density?: number;
        concurrency?: number;
        quota?: QuotaMeta;
        cache_hit?: boolean;
        cache_age_seconds?: number;
    };
}
/** Envelope returned by every tool on failure. */
export interface ProxyErrorResponse {
    ok: false;
    error: {
        code: ProxyErrorCode;
        /** Human-readable description (sanitized of credentials). */
        message: string;
        /** Whether a retry or parameter change can plausibly succeed. */
        recoverable: boolean;
        /** Suggested next step for the calling agent. */
        agent_instruction: string;
        /** Backoff hint, e.g. for RATE_LIMITED. */
        retry_after_seconds?: number;
    };
}
/** Union of all tool response envelopes. */
export type ProxyResponse = ProxySuccessResponse | ProxyErrorResponse;
|
package/build/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Build artifact for types.d.ts — all exports are type-only, so there is
// nothing to emit at runtime; the empty export marks this file as an ES module.
export {};
|
package/build/utils.d.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/** Truncate `s` to at most `maxChars` UTF-16 code units without splitting a surrogate pair. */
export declare function unicodeSafeTruncate(s: string, maxChars: number): string;
/** Decode the small set of HTML entities the markdown converter handles (&amp;, &lt;, &gt;, &quot;, &#39;, &nbsp;). */
export declare function decodeHtmlEntities(s: string): string;
/**
 * Remove noise elements from HTML BEFORE markdown conversion.
 * Conservative — only strips elements where the tag name or class/id strongly indicates noise.
 */
export declare function stripNoiseElements(html: string): string;
/** Convert HTML to lightweight markdown (headings, lists, links), stripping noise and scripts first. */
export declare function htmlToMarkdown(html: string): string;
/**
 * Count rough number of HTML tags in a string.
 */
export declare function countHtmlTags(html: string): number;
/**
 * Compute content density score: ratio of text content to total content + tag overhead.
 * Higher = cleaner content. Range: 0.0 to 1.0.
 */
export declare function contentDensity(markdownLength: number, tagCount: number): number;
/** Plain-text rendering: htmlToMarkdown output with link URLs, heading markers, and bullets stripped. */
export declare function htmlToText(html: string): string;
|
package/build/utils.js
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
/**
 * Truncate `s` to at most `maxChars` UTF-16 code units without splitting a
 * surrogate pair.
 *
 * @param {string} s - Input string.
 * @param {number} maxChars - Maximum length in UTF-16 code units.
 * @returns {string} A prefix of `s`, never ending in a stranded high surrogate.
 */
export function unicodeSafeTruncate(s, maxChars) {
    if (s.length <= maxChars)
        return s;
    let end = maxChars;
    const code = s.charCodeAt(end - 1);
    // Only a trailing HIGH surrogate is dangerous: its partner lies beyond the
    // cut, so drop the stranded half. A trailing LOW surrogate means the pair
    // is complete and already fully inside the slice, so it is kept.
    // FIX: the previous implementation also handled the low-surrogate case with
    // `end -= 2`, which wrongly discarded a complete pair (e.g. truncating
    // "a😀b" to 3 units returned "a" instead of "a😀").
    if (code >= 0xd800 && code <= 0xdbff)
        end--;
    return s.slice(0, end);
}
|
|
13
|
+
/**
 * Decode the small set of HTML entities this converter handles.
 *
 * FIX: the published build contained already-decoded replacement PATTERNS
 * (e.g. `.replace(/&/g, "&")`, `.replace(/ /g, " ")`), making the function a
 * no-op — the entity literals (&lt;, &amp;, …) are restored here. `&amp;` is
 * decoded LAST so double-escaped input ("&amp;lt;") yields the singly-escaped
 * form ("&lt;") instead of being decoded twice.
 */
export function decodeHtmlEntities(s) {
    return s
        .replace(/&lt;/g, "<")
        .replace(/&gt;/g, ">")
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&nbsp;/g, " ")
        .replace(/&amp;/g, "&");
}
/**
 * Noise class/id patterns that strongly indicate non-content elements.
 * Conservative: only match when the pattern is a clear indicator of noise.
 */
const NOISE_ATTR_PATTERN = /\b(cookie[-_]?banner|cookie[-_]?consent|cookie[-_]?notice|popup|modal|overlay|sidebar|nav[-_]?bar|navigation|footer|header|advertisement|ad[-_]?banner|social[-_]?share|share[-_]?buttons|comments?[-_]?section|menu[-_]?toggle|skip[-_]?nav|breadcrumb)\b/i;
/**
 * Remove noise elements from HTML BEFORE markdown conversion.
 * Conservative — only strips elements where the tag name or class/id strongly indicates noise.
 */
export function stripNoiseElements(html) {
    let result = html;
    // 1. Strip structural noise tags and their content: nav, header, footer, aside, form
    result = result.replace(/<nav[\s>][\s\S]*?<\/nav>/gi, "");
    result = result.replace(/<header[\s>][\s\S]*?<\/header>/gi, "");
    result = result.replace(/<footer[\s>][\s\S]*?<\/footer>/gi, "");
    result = result.replace(/<aside[\s>][\s\S]*?<\/aside>/gi, "");
    result = result.replace(/<form[\s>][\s\S]*?<\/form>/gi, "");
    // 2. Strip elements with noise class/id patterns
    // Match opening tags with class="..." or id="..." containing noise keywords,
    // then remove through the matching closing tag.
    // We handle <div>, <section>, <span>, <ul>, <ol>, <p> with noise attributes.
    const noiseTagNames = ["div", "section", "span", "ul", "ol", "p"];
    for (const tag of noiseTagNames) {
        // Opening tag whose class/id attribute value matches the noise pattern.
        const openTagRe = new RegExp(`<${tag}\\s[^>]*(?:class|id)\\s*=\\s*["'][^"']*${NOISE_ATTR_PATTERN.source}[^"']*["'][^>]*>`, "gi");
        let match;
        while ((match = openTagRe.exec(result)) !== null) {
            const startIdx = match.index;
            // Simple depth-based closing tag finder: walk forward counting
            // nested same-name opens/closes until depth returns to zero.
            const closeTag = `</${tag}>`;
            let depth = 1;
            let searchPos = startIdx + match[0].length;
            const openRe = new RegExp(`<${tag}[\\s>]`, "gi");
            const closeRe = new RegExp(`</${tag}>`, "gi");
            let endIdx = -1;
            while (depth > 0 && searchPos < result.length) {
                openRe.lastIndex = searchPos;
                closeRe.lastIndex = searchPos;
                const nextOpen = openRe.exec(result);
                const nextClose = closeRe.exec(result);
                if (!nextClose)
                    break; // malformed HTML, bail
                if (nextOpen && nextOpen.index < nextClose.index) {
                    depth++;
                    searchPos = nextOpen.index + nextOpen[0].length;
                }
                else {
                    depth--;
                    if (depth === 0) {
                        endIdx = nextClose.index + closeTag.length;
                    }
                    searchPos = nextClose.index + nextClose[0].length;
                }
            }
            if (endIdx !== -1) {
                result = result.slice(0, startIdx) + result.slice(endIdx);
                openTagRe.lastIndex = startIdx; // re-scan from same position
            }
        }
    }
    // 3. Strip hidden elements
    // NOTE(review): the non-greedy [\s\S]*?<\/[^>]+> stops at the FIRST closing
    // tag of any name, so nested hidden containers may be only partially
    // removed — acceptable for a best-effort cleaner; confirm before tightening.
    result = result.replace(/<[^>]+style\s*=\s*["'][^"']*display\s*:\s*none[^"']*["'][^>]*>[\s\S]*?<\/[^>]+>/gi, "");
    result = result.replace(/<[^>]+style\s*=\s*["'][^"']*visibility\s*:\s*hidden[^"']*["'][^>]*>[\s\S]*?<\/[^>]+>/gi, "");
    result = result.replace(/<[^>]+aria-hidden\s*=\s*["']true["'][^>]*>[\s\S]*?<\/[^>]+>/gi, "");
    // 4. Strip empty divs and spans (only whitespace content)
    result = result.replace(/<div[^>]*>\s*<\/div>/gi, "");
    result = result.replace(/<span[^>]*>\s*<\/span>/gi, "");
    return result;
}
/**
 * Convert HTML to lightweight markdown: headings (#), lists (-), links
 * ([text](url)); scripts/styles and noise elements are removed first.
 */
export function htmlToMarkdown(html) {
    // Step 1: Strip noise elements before conversion
    const cleaned = stripNoiseElements(html);
    let md = cleaned
        .replace(/<script[\s\S]*?<\/script>/gi, "")
        .replace(/<style[\s\S]*?<\/style>/gi, "")
        .replace(/<noscript[\s\S]*?<\/noscript>/gi, "")
        .replace(/<br\s*\/?>/gi, "\n")
        .replace(/<\/p>/gi, "\n\n")
        .replace(/<\/h[1-6]>/gi, "\n\n")
        .replace(/<\/li>/gi, "\n")
        .replace(/<li[^>]*>/gi, "- ")
        .replace(/<h([1-6])[^>]*>/gi, (_, n) => "#".repeat(Number(n)) + " ")
        .replace(/<a[^>]+href=["']([^"']+)["'][^>]*>([^<]*)<\/a>/gi, (_, href, text) => {
            const decoded = decodeHtmlEntities(href);
            // Drop unsafe URL schemes; keep the link text only.
            if (decoded.startsWith("data:") || decoded.startsWith("javascript:"))
                return text;
            return `[${text}](${decoded})`;
        })
        .replace(/<[^>]+>/g, "");
    // FIX: the inline entity pass was a no-op (already-decoded pattern
    // literals); decode through the shared helper now that all tags are gone.
    md = decodeHtmlEntities(md);
    // Step 2: Post-conversion cleanup
    // Collapse 3+ consecutive newlines to 2
    md = md.replace(/\n{3,}/g, "\n\n");
    // Remove lines that are only dashes or underscores (visual separators)
    md = md.replace(/^\s*[-_]{3,}\s*$/gm, "");
    // Trim trailing whitespace per line
    md = md.replace(/[^\S\n]+$/gm, "");
    // Final collapse after separator removal
    md = md.replace(/\n{3,}/g, "\n\n");
    return md.trim();
}
|
|
129
|
+
/**
 * Rough count of HTML tags in a string. Only tags whose name begins with a
 * letter are counted, so closing tags ("</div>") and comments are excluded.
 */
export function countHtmlTags(html) {
    return (html.match(/<[a-zA-Z][^>]*>/g) ?? []).length;
}
|
|
136
|
+
/**
 * Compute content density score: ratio of text content to total content + tag overhead.
 * Higher = cleaner content. Range: 0.0 to 1.0 (each tag is weighted as 10 chars).
 */
export function contentDensity(markdownLength, tagCount) {
    // Degenerate empty-document case — avoid 0/0.
    if (markdownLength === 0 && tagCount === 0)
        return 0;
    const ratio = markdownLength / (markdownLength + tagCount * 10);
    // Round to two decimals while returning a number, not a string.
    return Number(ratio.toFixed(2));
}
|
|
145
|
+
/**
 * Render HTML as plain text: convert to markdown first, then strip the
 * markdown syntax this module emits (links, heading markers, list bullets).
 */
export function htmlToText(html) {
    let text = htmlToMarkdown(html);
    text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1"); // keep link text, drop URL
    text = text.replace(/#+\s/g, "");                    // heading markers
    text = text.replace(/^-\s/gm, "");                   // list bullets
    return text.trim();
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// Proxy username injection prevention — no hyphens allowed
// (providers use `-` as segment delimiter in auth strings).
// The three patterns currently share the same charset but are exported
// separately so call sites read clearly and each can be tightened on its own.
export const SAFE_COUNTRY = /^[a-zA-Z0-9_]+$/;    // e.g. "US", "de"
export const SAFE_CITY = /^[a-zA-Z0-9_]+$/;       // e.g. "newyork", "london"
export const SAFE_SESSION_ID = /^[a-zA-Z0-9_]+$/; // sticky-session label
// Appended to every quota estimate — credit figures here are estimates only.
export const QUOTA_NOTE = "Check dashboard.novada.com for real-time balance";
|
package/package.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "novada-proxy-core",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "Core proxy engine \u2014 adapters, tools, types for Novada Proxy",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "build/index.js",
|
|
7
|
+
"types": "build/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./build/index.d.ts",
|
|
11
|
+
"import": "./build/index.js"
|
|
12
|
+
},
|
|
13
|
+
"./tools": {
|
|
14
|
+
"types": "./build/tools/index.d.ts",
|
|
15
|
+
"import": "./build/tools/index.js"
|
|
16
|
+
},
|
|
17
|
+
"./adapters": {
|
|
18
|
+
"types": "./build/adapters/index.d.ts",
|
|
19
|
+
"import": "./build/adapters/index.js"
|
|
20
|
+
},
|
|
21
|
+
"./errors": {
|
|
22
|
+
"types": "./build/errors.d.ts",
|
|
23
|
+
"import": "./build/errors.js"
|
|
24
|
+
}
|
|
25
|
+
},
|
|
26
|
+
"files": [
|
|
27
|
+
"build/**/*.js",
|
|
28
|
+
"build/**/*.d.ts",
|
|
29
|
+
"README.md"
|
|
30
|
+
],
|
|
31
|
+
"scripts": {
|
|
32
|
+
"build": "tsc",
|
|
33
|
+
"test": "vitest run"
|
|
34
|
+
},
|
|
35
|
+
"dependencies": {
|
|
36
|
+
"axios": "^1.7.0",
|
|
37
|
+
"http-proxy-agent": "^7.0.0",
|
|
38
|
+
"https-proxy-agent": "^9.0.0",
|
|
39
|
+
"puppeteer-core": "^22.15.0"
|
|
40
|
+
},
|
|
41
|
+
"devDependencies": {
|
|
42
|
+
"@types/node": "^20.11.24",
|
|
43
|
+
"typescript": "^5.3.3",
|
|
44
|
+
"vitest": "^4.1.4"
|
|
45
|
+
},
|
|
46
|
+
"engines": {
|
|
47
|
+
"node": ">=18.0.0"
|
|
48
|
+
},
|
|
49
|
+
"license": "MIT"
|
|
50
|
+
}
|