@vespermcp/mcp-server 1.2.22 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
// Hosts the extractor may fetch from: each base domain followed by its
// primary public subdomain, flattened into a single allow-list.
const WHITELIST = [
  ["sec.gov", "www"],
  ["wikipedia.org", "en"],
  ["producthunt.com", "www"],
  ["crunchbase.com", "www"],
].flatMap(([base, sub]) => [base, `${sub}.${base}`]);
/**
 * Remove HTML tags and collapse all whitespace runs to single spaces.
 * @param {string} input - Raw HTML fragment.
 * @returns {string} Trimmed plain text.
 */
function stripTags(input) {
  const untagged = input.replace(/<[^>]*>/g, " ");
  return untagged.split(/\s+/).filter(Boolean).join(" ");
}
/**
 * Decode a small set of common HTML entities.
 *
 * `&amp;` is decoded LAST: decoding it first caused double-decoding, where
 * double-escaped text such as "&amp;lt;" collapsed through "&lt;" into "<"
 * instead of stopping at "&lt;".
 *
 * @param {unknown} input - Value coerced to a string; null/undefined -> "".
 * @returns {string} Text with the supported entities replaced.
 */
function decodeHtml(input) {
  return String(input || "")
    .replace(/&nbsp;/g, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, "\"")
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, "&");
}
/**
 * Extract every <table> in a document as rows of " | "-joined cell text.
 * Empty rows and empty tables are dropped.
 * @param {string} html - Raw HTML.
 * @returns {string[][]} One array of row strings per non-empty table.
 */
function extractTables(html) {
  const tableMarkup = html.match(/<table[\s\S]*?<\/table>/gi) || [];
  const result = [];
  for (const markup of tableMarkup) {
    const lines = (markup.match(/<tr[\s\S]*?<\/tr>/gi) || [])
      .map((tr) => {
        const cellMarkup = tr.match(/<(td|th)[\s\S]*?<\/(td|th)>/gi) || [];
        return cellMarkup
          .map((cell) => decodeHtml(stripTags(cell)))
          .filter(Boolean)
          .join(" | ");
      })
      .filter(Boolean);
    if (lines.length > 0) {
      result.push(lines);
    }
  }
  return result;
}
/**
 * Extract the text of every <li> from all <ul>/<ol> blocks, in document order.
 * @param {string} html - Raw HTML.
 * @returns {string[]} Non-empty list-item texts.
 */
function extractLists(html) {
  const listMarkup = html.match(/<(ul|ol)[\s\S]*?<\/(ul|ol)>/gi) || [];
  return listMarkup.flatMap((block) =>
    (block.match(/<li[\s\S]*?<\/li>/gi) || [])
      .map((li) => decodeHtml(stripTags(li)))
      .filter(Boolean)
  );
}
/**
 * Extract key/value pairs from a table carrying an "infobox" class
 * (quoted or unquoted class attribute). Rows need both a <th> (key)
 * and a <td> (value) to be kept.
 * @param {string} html - Raw HTML.
 * @returns {Object<string, string>} Infobox fields; {} when no infobox found.
 */
function extractInfobox(html) {
  const quotedClass = /<table[^>]*class\s*=\s*["'][^"']*infobox[^"']*["'][\s\S]*?<\/table>/i;
  const bareClass = /<table[^>]*class=[^>\s]*infobox[^>\s]*[\s\S]*?<\/table>/i;
  const box = html.match(quotedClass)?.[0] ?? html.match(bareClass)?.[0];
  if (!box) {
    return {};
  }
  const info = {};
  for (const row of box.match(/<tr[\s\S]*?<\/tr>/gi) || []) {
    const key = decodeHtml(stripTags(row.match(/<th[\s\S]*?<\/th>/i)?.[0] ?? ""));
    const value = decodeHtml(stripTags(row.match(/<td[\s\S]*?<\/td>/i)?.[0] ?? ""));
    if (key && value) {
      info[key] = value;
    }
  }
  return info;
}
/**
 * Check that every required field is present and non-empty on `data`.
 * A field counts as missing when it is undefined, null, or "".
 * @param {Object} data - Extracted payload to validate.
 * @param {{required_fields?: string[]}} [schema] - Contract to enforce.
 * @returns {{ok: boolean, missing: string[]}} Validation verdict.
 */
function validateSchema(data, schema) {
  const missing = [];
  for (const field of schema?.required_fields || []) {
    const value = data[field];
    if (value === undefined || value === null || value === "") {
      missing.push(field);
    }
  }
  return { ok: missing.length === 0, missing };
}
/**
 * Best-effort conversion of an arbitrary value to a "YYYY-MM-DD" string.
 * NOTE(review): relies on the engine's loose `new Date(string)` parsing for
 * non-ISO formats, so ambiguous inputs may vary by runtime — the "loose"
 * behavior is intentional here.
 * @param {unknown} v - Candidate date value.
 * @returns {string|null} UTC ISO date, or null when empty/unparseable.
 */
function toIsoDateLoose(v) {
  const text = String(v || "").trim();
  if (!text) {
    return null;
  }
  const parsed = new Date(text);
  return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString().slice(0, 10);
}
/**
 * Loosely parse a money-like string into a whole-dollar amount.
 * Strips commas, tolerates an optional "$", and scales by a K/M/B suffix
 * (case-insensitive, so e.g. "5 million" also scales via its leading "m").
 * @param {unknown} v - Candidate amount.
 * @returns {number|null} Rounded dollar amount, or null when nothing numeric.
 */
function parseUsdLoose(v) {
  const text = String(v || "").trim().replace(/,/g, "");
  if (!text) {
    return null;
  }
  const match = text.match(/\$?\s*([\d.]+)\s*([KMB])?/i);
  if (!match) {
    return null;
  }
  const amount = Number(match[1]);
  if (!Number.isFinite(amount)) {
    return null;
  }
  const multipliers = { K: 1_000, M: 1_000_000, B: 1_000_000_000 };
  const scale = multipliers[(match[2] || "").toUpperCase()] ?? 1;
  return Math.round(amount * scale);
}
/**
 * Apply per-domain post-processing to extracted data.
 *
 * - sec.gov: add `<key>_iso` next to date-ish infobox fields and append
 *   "[date_iso=...]" markers to list items that parse as dates.
 * - crunchbase.com: add `<key>_usd` next to money-ish infobox fields and
 *   append "[usd=...]" markers to list items that parse as amounts.
 *
 * @param {string} domain - Lowercased hostname the data came from.
 * @param {Object} data - Extracted payload ({tables} | {items} | {infobox}).
 * @returns {Object} A shallow copy of `data` with annotations added.
 */
function applyDomainTransforms(domain, data) {
  // Exact host or true subdomain only; the previous bare
  // endsWith("sec.gov") also matched unrelated hosts such as "parsec.gov".
  // This mirrors the WHITELIST check in WebExtractorEngine.extract().
  const isUnder = (host, base) => host === base || host.endsWith(`.${base}`);
  const out = { ...data };
  if (isUnder(domain, "sec.gov")) {
    if (out.infobox && typeof out.infobox === "object") {
      const normalized = { ...out.infobox };
      for (const [k, v] of Object.entries(normalized)) {
        const date = toIsoDateLoose(String(v));
        // Only annotate keys that look date-related to avoid false positives.
        if (date && /(date|filed|accepted|period|fiscal)/i.test(k)) {
          normalized[`${k}_iso`] = date;
        }
      }
      out.infobox = normalized;
    }
    if (Array.isArray(out.items)) {
      out.items = out.items.map((line) => {
        const date = toIsoDateLoose(line);
        return date ? `${line} [date_iso=${date}]` : line;
      });
    }
  }
  if (isUnder(domain, "crunchbase.com")) {
    if (out.infobox && typeof out.infobox === "object") {
      const normalized = { ...out.infobox };
      for (const [k, v] of Object.entries(normalized)) {
        const usd = parseUsdLoose(String(v));
        // Only annotate keys that look money-related.
        if (usd !== null && /(fund|raised|valuation|revenue|price|amount|usd|\$)/i.test(k)) {
          normalized[`${k}_usd`] = usd;
        }
      }
      out.infobox = normalized;
    }
    if (Array.isArray(out.items)) {
      out.items = out.items.map((line) => {
        const usd = parseUsdLoose(line);
        return usd !== null ? `${line} [usd=${usd}]` : line;
      });
    }
  }
  return out;
}
/**
 * P1 strict schema contracts: required top-level data fields per domain.
 * @param {string} domain - Lowercased hostname.
 * @param {string} contentType - "table" | "infobox" | "list".
 * @returns {string[]} Field names that must be present in the extracted data.
 */
function strictSchemaForDomain(domain, contentType) {
  // Exact host or true subdomain only (consistent with the WHITELIST check);
  // the previous bare endsWith("sec.gov") also accepted hosts like "parsec.gov".
  const isUnder = (base) => domain === base || domain.endsWith(`.${base}`);
  if (isUnder("sec.gov")) {
    return contentType === "table" ? ["tables"] : ["items"];
  }
  if (isUnder("crunchbase.com")) {
    return contentType === "infobox" ? ["infobox"] : ["items"];
  }
  if (isUnder("wikipedia.org")) {
    return contentType === "infobox" ? ["infobox"] : ["tables"];
  }
  if (isUnder("producthunt.com")) {
    return ["items"];
  }
  return [];
}
/**
 * Build a plain, JSON-serializable error descriptor (deliberately not an
 * Error instance: `message` on Error is non-enumerable and would be dropped
 * by JSON.stringify).
 * @param {string} code - Machine-readable error code.
 * @param {string} message - Human-readable description.
 * @param {Object} [details] - Optional structured context.
 * @returns {{code: string, message: string, details: (Object|undefined)}}
 */
function toError(code, message, details) {
  const descriptor = { code, message, details };
  return descriptor;
}
/**
 * Structured-content extractor for a small allow-list of sites.
 *
 * Fetches a whitelisted URL, extracts tables / lists / an infobox from the
 * static HTML, applies per-domain transforms, validates against a schema,
 * and caches successful results so a later live failure can fall back to
 * the cached copy.
 */
export class WebExtractorEngine {
  // Optional cache backend. NOTE(review): assumed interface is
  // `getJson(key) -> Promise<object|undefined>` and
  // `setJson(key, value, ttlSeconds)` — confirm against the caller.
  cache;
  /**
   * @param cache - Optional cache backend (see note on the field above).
   */
  constructor(cache) {
    this.cache = cache;
  }
  /**
   * Extract structured content from `input.url`.
   *
   * @param input - { url, mode?: "auto"|"table"|"infobox"|"list",
   *                  strict_schema?: boolean,
   *                  schema?: { required_fields?: string[] } }
   * @returns A result object: { ok, url, domain, mode, extracted_at,
   *          content_type, data, cache, error? } — errors are returned,
   *          never thrown to the caller.
   */
  async extract(input) {
    let parsed;
    try {
      parsed = new URL(String(input.url || "").trim());
    }
    catch {
      // Unparseable URL: fail fast with a structured error result.
      return {
        ok: false,
        url: String(input.url || ""),
        domain: "",
        mode: input.mode || "auto",
        extracted_at: new Date().toISOString(),
        content_type: "list",
        data: {},
        error: toError("INVALID_URL", "Invalid URL"),
        cache: { hit: false, fallback_used: false, max_age_ms: 21600_000 },
      };
    }
    const domain = parsed.hostname.toLowerCase();
    // Allow-list check: exact host or a true subdomain of an allowed entry.
    if (!WHITELIST.some(d => domain === d || domain.endsWith(`.${d}`))) {
      return {
        ok: false,
        url: parsed.toString(),
        domain,
        mode: input.mode || "auto",
        extracted_at: new Date().toISOString(),
        content_type: "list",
        data: {},
        error: toError("DOMAIN_NOT_ALLOWED", `Domain not allowed: ${domain}`, { allowed: WHITELIST }),
        cache: { hit: false, fallback_used: false, max_age_ms: 21600_000 },
      };
    }
    const mode = input.mode || "auto";
    // Strict domain schemas are opt-out: only `strict_schema: false` disables them.
    const strict = input.strict_schema !== false;
    // NOTE(review): `parsed.search` already includes its leading "?", so keys
    // contain "??" — harmless, but changing it would invalidate existing
    // cache entries; confirm before "fixing".
    const cacheKey = `webextract:${domain}:${parsed.pathname}?${parsed.search}:mode=${mode}`;
    // Look up a prior successful result up front so the catch block can
    // serve it if the live fetch/extraction fails.
    const cached = await this.cache?.getJson(cacheKey);
    const maxAgeMs = 21600_000; // 6 hours
    const cacheAgeMs = cached?.extracted_at ? Math.max(0, Date.now() - new Date(cached.extracted_at).getTime()) : undefined;
    try {
      const res = await fetch(parsed.toString(), {
        headers: {
          "User-Agent": "vesper/2.0 (phase3-extract-web)",
        },
      });
      if (!res.ok) {
        // Plain-object throw (not Error); the catch below reads e.code/e.message.
        throw toError("HTTP_ERROR", `HTTP ${res.status}`, { status: res.status });
      }
      const html = await res.text();
      let content_type = "list";
      let data = {};
      const tables = extractTables(html);
      const lists = extractLists(html);
      const infobox = extractInfobox(html);
      // Mode resolution: explicit mode wins; in "auto", prefer tables,
      // then infobox, then lists.
      if (mode === "table" || (mode === "auto" && tables.length > 0)) {
        content_type = "table";
        data = { tables };
      }
      else if (mode === "infobox" || (mode === "auto" && Object.keys(infobox).length > 0)) {
        content_type = "infobox";
        data = { infobox };
      }
      else {
        content_type = "list";
        data = { items: lists };
      }
      // An explicitly requested mode can still come back empty — treat that
      // as an extraction failure so the cache fallback can kick in.
      if ((content_type === "table" && (!data.tables || data.tables.length === 0)) ||
        (content_type === "list" && (!data.items || data.items.length === 0)) ||
        (content_type === "infobox" && (!data.infobox || Object.keys(data.infobox).length === 0))) {
        throw toError("EXTRACTION_EMPTY", "No structured content extracted from static HTML; possible dynamic page or site layout change.");
      }
      data = applyDomainTransforms(domain, data);
      // Required fields = per-domain strict contract (unless opted out)
      // plus any caller-supplied requirements.
      const mergedSchemaRequired = [
        ...(strict ? strictSchemaForDomain(domain, content_type) : []),
        ...(input.schema?.required_fields || []),
      ];
      const schemaCheck = validateSchema(data, { required_fields: mergedSchemaRequired });
      if (!schemaCheck.ok) {
        throw toError("SCHEMA_VALIDATION_FAILED", "Schema validation failed", {
          missing: schemaCheck.missing,
          required_fields: mergedSchemaRequired,
          strict_domain_schema: strict,
        });
      }
      const out = {
        ok: true,
        url: parsed.toString(),
        domain,
        mode,
        extracted_at: new Date().toISOString(),
        content_type,
        data,
        cache: {
          hit: false,
          fallback_used: false,
          max_age_ms: maxAgeMs,
        },
      };
      // Only successful results are cached, so a cache hit in the catch
      // block is always a previously-good extraction.
      await this.cache?.setJson(cacheKey, out, 21600); // 6h fallback cache
      return out;
    }
    catch (e) {
      if (cached) {
        // Stale-but-usable fallback: serve the cached success, flagged with
        // both a warning and the live failure as `error`.
        return {
          ...cached,
          ok: true,
          from_cache: true,
          warning: `Live extraction failed, served cached content. Reason: ${e?.message || String(e)}`,
          cache: {
            hit: true,
            fallback_used: true,
            age_ms: cacheAgeMs,
            max_age_ms: maxAgeMs,
          },
          error: toError("LIVE_EXTRACTION_FAILED", e?.message || String(e)),
        };
      }
      // No fallback available: surface the failure as a structured result.
      return {
        ok: false,
        url: parsed.toString(),
        domain,
        mode,
        extracted_at: new Date().toISOString(),
        content_type: "list",
        data: {},
        cache: {
          hit: false,
          fallback_used: false,
          max_age_ms: maxAgeMs,
        },
        // Internal toError() throws already carry a code; wrap anything else
        // (e.g. a network-level Error from fetch).
        error: e?.code
          ? e
          : toError("LIVE_EXTRACTION_FAILED", e?.message || String(e)),
      };
    }
  }
}