webpeel 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +415 -0
- package/dist/cli.d.ts +16 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +140 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/fetcher.d.ts +32 -0
- package/dist/core/fetcher.d.ts.map +1 -0
- package/dist/core/fetcher.js +479 -0
- package/dist/core/fetcher.js.map +1 -0
- package/dist/core/markdown.d.ts +17 -0
- package/dist/core/markdown.d.ts.map +1 -0
- package/dist/core/markdown.js +143 -0
- package/dist/core/markdown.js.map +1 -0
- package/dist/core/metadata.d.ts +17 -0
- package/dist/core/metadata.d.ts.map +1 -0
- package/dist/core/metadata.js +159 -0
- package/dist/core/metadata.js.map +1 -0
- package/dist/core/strategies.d.ts +30 -0
- package/dist/core/strategies.d.ts.map +1 -0
- package/dist/core/strategies.js +67 -0
- package/dist/core/strategies.js.map +1 -0
- package/dist/index.d.ts +31 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +81 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/server.js +248 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/server/app.d.ts +13 -0
- package/dist/server/app.d.ts.map +1 -0
- package/dist/server/app.js +89 -0
- package/dist/server/app.js.map +1 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.d.ts.map +1 -0
- package/dist/server/auth-store.js +87 -0
- package/dist/server/auth-store.js.map +1 -0
- package/dist/server/middleware/auth.d.ts +18 -0
- package/dist/server/middleware/auth.d.ts.map +1 -0
- package/dist/server/middleware/auth.js +55 -0
- package/dist/server/middleware/auth.js.map +1 -0
- package/dist/server/middleware/rate-limit.d.ts +23 -0
- package/dist/server/middleware/rate-limit.d.ts.map +1 -0
- package/dist/server/middleware/rate-limit.js +85 -0
- package/dist/server/middleware/rate-limit.js.map +1 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.d.ts.map +1 -0
- package/dist/server/routes/fetch.js +127 -0
- package/dist/server/routes/fetch.js.map +1 -0
- package/dist/server/routes/health.d.ts +6 -0
- package/dist/server/routes/health.d.ts.map +1 -0
- package/dist/server/routes/health.js +19 -0
- package/dist/server/routes/health.js.map +1 -0
- package/dist/server/routes/search.d.ts +7 -0
- package/dist/server/routes/search.d.ts.map +1 -0
- package/dist/server/routes/search.js +124 -0
- package/dist/server/routes/search.js.map +1 -0
- package/dist/types.d.ts +59 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +30 -0
- package/dist/types.js.map +1 -0
- package/llms.txt +60 -0
- package/package.json +80 -0
|
@@ -0,0 +1,479 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core fetching logic: simple HTTP and browser-based fetching
|
|
3
|
+
*/
|
|
4
|
+
import { chromium } from 'playwright';
|
|
5
|
+
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
6
|
+
/**
 * Desktop browser User-Agent strings used when the caller does not supply one.
 */
const USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
];
/**
 * Pick one entry of USER_AGENTS pseudo-randomly.
 *
 * @returns {string} One of the strings listed in USER_AGENTS.
 */
function getRandomUserAgent() {
    const poolSize = USER_AGENTS.length;
    const pickedIndex = Math.floor(Math.random() * poolSize);
    return USER_AGENTS[pickedIndex];
}
|
|
16
|
+
/**
 * SECURITY: Validate a URL before it is fetched, to prevent SSRF attacks.
 *
 * Rejects over-long URLs, control characters, non-HTTP(S) schemes, localhost
 * names (including the fully-qualified "localhost." form) and, through the
 * IPv4/IPv6 helpers, loopback / private / link-local addresses.
 *
 * Only the literal hostname is inspected; no DNS resolution is performed here.
 *
 * @param {string} urlString - Absolute URL supplied by the caller or by a redirect.
 * @returns {void}
 * @throws {WebPeelError} When the URL is malformed or points at a disallowed host.
 */
function validateUrl(urlString) {
    // A non-string would otherwise crash on `.length` with a TypeError.
    if (typeof urlString !== 'string') {
        throw new WebPeelError('Invalid URL format');
    }
    // Length check
    if (urlString.length > 2048) {
        throw new WebPeelError('URL too long (max 2048 characters)');
    }
    // Check for control characters and suspicious encoding
    if (/[\x00-\x1F\x7F]/.test(urlString)) {
        throw new WebPeelError('URL contains invalid control characters');
    }
    let url;
    try {
        url = new URL(urlString);
    }
    catch {
        throw new WebPeelError('Invalid URL format');
    }
    // Only allow HTTP(S)
    if (!['http:', 'https:'].includes(url.protocol)) {
        throw new WebPeelError('Only HTTP and HTTPS protocols are allowed');
    }
    // Validate hostname is not empty
    if (!url.hostname) {
        throw new WebPeelError('Invalid hostname');
    }
    // Strip the trailing root dot(s): "localhost." resolves exactly like
    // "localhost" and would otherwise slip past the comparisons below.
    const hostname = url.hostname.toLowerCase().replace(/\.+$/, '');
    if (!hostname) {
        throw new WebPeelError('Invalid hostname');
    }
    // Block localhost patterns
    const localhostPatterns = ['localhost', '0.0.0.0'];
    if (localhostPatterns.some((pattern) => hostname === pattern || hostname.endsWith(`.${pattern}`))) {
        throw new WebPeelError('Access to localhost is not allowed');
    }
    // Parse and validate IP addresses (handles hex, octal, decimal, mixed)
    const ipv4Info = parseAndValidateIPv4(hostname);
    if (ipv4Info) {
        validateIPv4Address(ipv4Info);
    }
    // IPv6 literals are the only hostnames that contain ':'
    if (hostname.includes(':')) {
        validateIPv6Address(hostname);
    }
}
|
|
60
|
+
/**
 * Parse one dot-separated component of a numeric host.
 *
 * @param {string} part - Component text, e.g. "0x7f", "0177" or "127".
 * @returns {number|null} The value (hex with 0x prefix, octal with leading 0,
 *   otherwise decimal) or null when the text is not purely numeric.
 */
function parseIPv4Part(part) {
    if (/^0x[0-9a-f]+$/i.test(part)) {
        return Number.parseInt(part, 16);
    }
    if (/^0[0-7]+$/.test(part)) {
        return Number.parseInt(part, 8);
    }
    if (/^\d+$/.test(part)) {
        return Number.parseInt(part, 10);
    }
    return null;
}
/**
 * Parse an IPv4 address written in any inet_aton-style notation: dotted
 * decimal, hex (0x7f000001), octal (017700000001), plain decimal (2130706433),
 * mixed radix per component (0177.0.0.1, 0x7f.0.0.1) and the short forms with
 * one to three components (127.1), where the last component fills the
 * remaining low-order bytes.
 *
 * @param {string} hostname - Host text, optionally wrapped in [ ].
 * @returns {number[]|null} The four octets, or null if the host is not an
 *   IPv4 address (including numeric values that do not fit in 32 bits).
 * @throws {WebPeelError} When the host has the plain a.b.c.d decimal shape but
 *   an octet exceeds 255.
 */
function parseAndValidateIPv4(hostname) {
    // Remove brackets if present
    const cleaned = hostname.replace(/^\[|\]$/g, '');
    // Standard dotted notation: 192.168.1.1. Handled first, and read as
    // decimal, so existing callers keep seeing the same result for it.
    const dottedMatch = cleaned.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
    if (dottedMatch) {
        const octets = dottedMatch.slice(1).map(Number);
        if (octets.every((o) => o >= 0 && o <= 255)) {
            return octets;
        }
        throw new WebPeelError('Invalid IPv4 address');
    }
    const parts = cleaned.split('.');
    if (parts.length > 4) {
        return null;
    }
    const values = parts.map((part) => parseIPv4Part(part));
    if (values.includes(null)) {
        return null;
    }
    // The last component covers every byte not named by the leading ones.
    const tail = values.pop();
    if (values.some((v) => v > 255)) {
        return null;
    }
    if (tail >= 256 ** (4 - values.length)) {
        // Does not fit; previously a too-large hex value wrapped modulo 2^32.
        return null;
    }
    const num = values.reduce((acc, v, i) => acc + v * 256 ** (3 - i), tail);
    return [
        (num >>> 24) & 0xff,
        (num >>> 16) & 0xff,
        (num >>> 8) & 0xff,
        num & 0xff,
    ];
}
|
|
124
|
+
/**
 * Reject IPv4 addresses that belong to loopback, private, shared (CGNAT),
 * link-local, broadcast, "this network", multicast or reserved ranges.
 *
 * @param {number[]} octets - The four octets of the address, most significant first.
 * @returns {void}
 * @throws {WebPeelError} When the address falls in one of the blocked ranges.
 */
function validateIPv4Address(octets) {
    const [a, b, c, d] = octets;
    // Loopback: 127.0.0.0/8
    if (a === 127) {
        throw new WebPeelError('Access to loopback addresses is not allowed');
    }
    // Private: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
    const isPrivate = a === 10 ||
        (a === 172 && b >= 16 && b <= 31) ||
        (a === 192 && b === 168);
    if (isPrivate) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Shared address space / carrier-grade NAT: 100.64.0.0/10 (RFC 6598).
    // Not publicly routable, so it is treated like the private ranges.
    if (a === 100 && b >= 64 && b <= 127) {
        throw new WebPeelError('Access to shared (carrier-grade NAT) addresses is not allowed');
    }
    // Link-local: 169.254.0.0/16
    if (a === 169 && b === 254) {
        throw new WebPeelError('Access to link-local addresses is not allowed');
    }
    // Broadcast: 255.255.255.255 (checked before the wider 240/4 rule so the
    // existing error message is preserved)
    if (a === 255 && b === 255 && c === 255 && d === 255) {
        throw new WebPeelError('Access to broadcast address is not allowed');
    }
    // This network: 0.0.0.0/8
    if (a === 0) {
        throw new WebPeelError('Access to "this network" addresses is not allowed');
    }
    // Multicast 224.0.0.0/4 and reserved 240.0.0.0/4 are never valid web origins
    if (a >= 224) {
        throw new WebPeelError('Access to multicast or reserved addresses is not allowed');
    }
}
|
|
158
|
+
/**
 * Reject IPv6 literals that are unspecified, loopback, IPv4-mapped, unique
 * local or link-local.
 *
 * For IPv4-mapped addresses the embedded IPv4 address is first run through
 * validateIPv4Address (so its more specific error wins); any mapped address
 * that survives that check is still rejected.
 *
 * @param {string} hostname - IPv6 literal, optionally wrapped in [ ].
 * @returns {void}
 * @throws {WebPeelError} When the address falls in one of the blocked ranges.
 */
function validateIPv6Address(hostname) {
    // Remove brackets
    const addr = hostname.replace(/^\[|\]$/g, '').toLowerCase();
    // Unspecified: :: (the IPv6 counterpart of 0.0.0.0)
    if (addr === '::' || addr === '0:0:0:0:0:0:0:0') {
        throw new WebPeelError('Access to unspecified addresses is not allowed');
    }
    // Loopback: ::1
    if (addr === '::1' || addr === '0:0:0:0:0:0:0:1') {
        throw new WebPeelError('Access to loopback addresses is not allowed');
    }
    // IPv6 mapped IPv4: ::ffff:192.168.1.1 or ::ffff:c0a8:0101
    if (addr.startsWith('::ffff:')) {
        const ipv4Part = addr.substring(7);
        if (ipv4Part.includes('.')) {
            // Dotted form
            const parts = ipv4Part.split('.');
            if (parts.length === 4) {
                const octets = parts.map((p) => Number.parseInt(p, 10));
                if (octets.every((o) => !Number.isNaN(o) && o >= 0 && o <= 255)) {
                    validateIPv4Address(octets);
                }
            }
        }
        else {
            // Hex form: two 16-bit groups. Leading zeros are usually dropped
            // ("7f00:1"), so each group must be padded back to four digits
            // before joining, otherwise the octets come out shifted.
            const groups = ipv4Part.split(':');
            if (groups.length === 2 && groups.every((g) => /^[0-9a-f]{1,4}$/.test(g))) {
                const hexStr = groups.map((g) => g.padStart(4, '0')).join('');
                const num = Number.parseInt(hexStr, 16);
                validateIPv4Address([
                    (num >>> 24) & 0xff,
                    (num >>> 16) & 0xff,
                    (num >>> 8) & 0xff,
                    num & 0xff,
                ]);
            }
        }
        throw new WebPeelError('Access to IPv6-mapped IPv4 addresses is not allowed');
    }
    // Unique local addresses: fc00::/7 (fc00:: to fdff::)
    if (addr.startsWith('fc') || addr.startsWith('fd')) {
        throw new WebPeelError('Access to unique local IPv6 addresses is not allowed');
    }
    // Link-local: fe80::/10
    if (['fe8', 'fe9', 'fea', 'feb'].some((prefix) => addr.startsWith(prefix))) {
        throw new WebPeelError('Access to link-local IPv6 addresses is not allowed');
    }
}
|
|
209
|
+
/**
 * Check a caller-supplied User-Agent header value.
 *
 * @param {string} userAgent - Candidate header value.
 * @returns {string} The same string, unchanged, when it is acceptable.
 * @throws {WebPeelError} When it is longer than 500 characters or contains
 *   anything other than printable ASCII.
 */
function validateUserAgent(userAgent) {
    const MAX_LENGTH = 500;
    const exceedsLimit = userAgent.length > MAX_LENGTH;
    if (exceedsLimit) {
        throw new WebPeelError('User agent too long (max 500 characters)');
    }
    // Printable ASCII only (space through tilde)
    const printableAscii = /^[\x20-\x7E]*$/;
    if (printableAscii.test(userAgent) === false) {
        throw new WebPeelError('User agent contains invalid characters');
    }
    return userAgent;
}
|
|
222
|
+
/**
 * Simple HTTP fetch using the native fetch API.
 * Fast and lightweight, but can be blocked by Cloudflare/bot detection.
 *
 * SECURITY: redirects are followed manually so every hop is re-validated
 * against the SSRF rules, the response must be HTML, and the body is streamed
 * with a 10MB cap. The per-request timeout stays armed until the body has
 * been read completely, so a server that stalls mid-body is also timed out.
 *
 * @param {string} url - Absolute http(s) URL to fetch.
 * @param {string} [userAgent] - User-Agent to send; a random one is used when omitted.
 * @param {number} [timeoutMs=30000] - Time budget for each request (headers and body).
 * @returns {Promise<{html: string, url: string, statusCode: number}>} The
 *   decoded body, the final URL after redirects and the HTTP status.
 * @throws {WebPeelError} Invalid URL or user agent, redirect loop, too many
 *   redirects, non-HTML content type or oversized body.
 * @throws {BlockedError} HTTP 403/503, a tiny body or a Cloudflare challenge page.
 * @throws {TimeoutError} The request was aborted by the timeout.
 * @throws {NetworkError} Any other HTTP or transport failure.
 */
export async function simpleFetch(url, userAgent, timeoutMs = 30000) {
    // SECURITY: Validate URL to prevent SSRF
    validateUrl(url);
    // Validate user agent if provided
    const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
    const MAX_REDIRECTS = 10;
    const MAX_SIZE = 10 * 1024 * 1024; // 10MB
    let redirectCount = 0;
    let currentUrl = url;
    const seenUrls = new Set();
    while (redirectCount <= MAX_REDIRECTS) {
        // Detect redirect loops
        if (seenUrls.has(currentUrl)) {
            throw new WebPeelError('Redirect loop detected');
        }
        seenUrls.add(currentUrl);
        // Re-validate on each redirect
        validateUrl(currentUrl);
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), timeoutMs);
        try {
            const response = await fetch(currentUrl, {
                headers: {
                    'User-Agent': validatedUserAgent,
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'DNT': '1',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                },
                signal: controller.signal,
                redirect: 'manual', // SECURITY: Manual redirect handling
            });
            // Handle redirects manually
            if (response.status >= 300 && response.status < 400) {
                const location = response.headers.get('location');
                if (!location) {
                    throw new NetworkError('Redirect response missing Location header');
                }
                // Resolve relative URLs
                currentUrl = new URL(location, currentUrl).href;
                redirectCount++;
                continue;
            }
            if (!response.ok) {
                if (response.status === 403 || response.status === 503) {
                    throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
                }
                throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`);
            }
            // SECURITY: Validate Content-Type
            const contentType = response.headers.get('content-type') || '';
            if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) {
                throw new WebPeelError('Unsupported content type. Only HTML is supported.');
            }
            // SECURITY: Stream response with size limit (prevent memory exhaustion)
            const reader = response.body?.getReader();
            if (!reader) {
                throw new NetworkError('Response body is not readable');
            }
            const chunks = [];
            let totalSize = 0;
            try {
                while (true) {
                    const { done, value } = await reader.read();
                    if (done) {
                        break;
                    }
                    totalSize += value.length;
                    if (totalSize > MAX_SIZE) {
                        // Best effort: a failure to cancel must not mask the size error.
                        reader.cancel().catch(() => { });
                        throw new WebPeelError('Response too large (max 10MB)');
                    }
                    chunks.push(value);
                }
            }
            finally {
                reader.releaseLock();
            }
            // Combine chunks
            const combined = new Uint8Array(totalSize);
            let offset = 0;
            for (const chunk of chunks) {
                combined.set(chunk, offset);
                offset += chunk.length;
            }
            const html = new TextDecoder().decode(combined);
            if (!html || html.length < 100) {
                throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
            }
            // Check for Cloudflare challenge
            if (html.includes('cf-browser-verification') || html.includes('Just a moment...')) {
                throw new BlockedError('Cloudflare challenge detected. Try --render for browser mode.');
            }
            return {
                html,
                url: currentUrl,
                statusCode: response.status,
            };
        }
        catch (error) {
            if (error instanceof BlockedError || error instanceof NetworkError || error instanceof WebPeelError) {
                throw error;
            }
            if (error instanceof Error && error.name === 'AbortError') {
                throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
            }
            throw new NetworkError(`Failed to fetch: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        finally {
            // Runs on success, on error and on `continue`. The timer is only
            // disarmed here so it also covers body streaming. Aborting is a
            // no-op once the body is fully read and otherwise releases the
            // connection of redirect/error responses whose body was not consumed.
            clearTimeout(timer);
            controller.abort();
        }
    }
    throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
}
|
|
341
|
+
// Shared headless browser, reused across browserFetch calls.
let sharedBrowser = null;
// In-flight chromium.launch() promise, so concurrent callers share one launch.
let pendingBrowserLaunch = null;
let activePagesCount = 0;
const MAX_CONCURRENT_PAGES = 5;
/**
 * Return the shared Chromium instance, launching it when there is none or the
 * previous one is no longer connected.
 *
 * Callers that arrive while a launch is in progress all await that same
 * launch; otherwise each would start its own browser and every instance but
 * the last one stored would never be closed.
 *
 * @returns {Promise<object>} The connected Playwright browser.
 */
async function getBrowser() {
    // SECURITY: Check if browser is still connected and healthy
    if (sharedBrowser) {
        try {
            if (sharedBrowser.isConnected()) {
                return sharedBrowser;
            }
        }
        catch {
            // Deliberately ignored: a throwing health check means the browser
            // is dead and is replaced below.
        }
        sharedBrowser = null;
    }
    if (!pendingBrowserLaunch) {
        pendingBrowserLaunch = chromium.launch({ headless: true })
            .then((browser) => {
                sharedBrowser = browser;
                return browser;
            })
            .finally(() => {
                // Cleared on failure too, so the next caller can retry the launch.
                pendingBrowserLaunch = null;
            });
    }
    return pendingBrowserLaunch;
}
|
|
360
|
+
/**
 * Fetch using headless Chromium via Playwright.
 * Slower but can handle JavaScript-heavy sites and bypass some bot detection.
 *
 * At most MAX_CONCURRENT_PAGES pages are open at once; further calls poll for
 * a free slot for up to 30 seconds. Images, fonts, media and stylesheets are
 * not loaded. The whole navigation (including the optional wait) is bounded
 * by `timeoutMs`.
 *
 * SECURITY: the URL is validated before navigation, and the page's final URL
 * is validated again afterwards so content reached through a browser-side
 * redirect to a disallowed host is not returned to the caller.
 *
 * @param {string} url - Absolute http(s) URL to load.
 * @param {object} [options]
 * @param {string} [options.userAgent] - User-Agent to use; random when omitted.
 * @param {number} [options.waitMs=0] - Extra wait after DOMContentLoaded (0-60000).
 * @param {number} [options.timeoutMs=30000] - Overall time budget.
 * @returns {Promise<{html: string, url: string}>} Rendered HTML and final URL.
 * @throws {WebPeelError} Invalid URL, user agent or wait time; oversized page.
 * @throws {BlockedError} The rendered page is empty or suspiciously small.
 * @throws {TimeoutError} Queue wait or navigation exceeded its time budget.
 * @throws {NetworkError} Any other browser failure.
 */
export async function browserFetch(url, options = {}) {
    // SECURITY: Validate URL to prevent SSRF
    validateUrl(url);
    const { userAgent, waitMs = 0, timeoutMs = 30000 } = options;
    // Validate user agent if provided
    const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
    // Validate wait time
    if (waitMs < 0 || waitMs > 60000) {
        throw new WebPeelError('Wait time must be between 0 and 60000ms');
    }
    // SECURITY: Limit concurrent browser pages with timeout
    const queueStartTime = Date.now();
    const QUEUE_TIMEOUT_MS = 30000; // 30 second max wait
    while (activePagesCount >= MAX_CONCURRENT_PAGES) {
        if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
            throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
        }
        await new Promise((resolve) => setTimeout(resolve, 100));
    }
    activePagesCount++;
    let page = null;
    let timeoutTimer = null;
    try {
        const browser = await getBrowser();
        page = await browser.newPage({
            userAgent: validatedUserAgent,
        });
        // Block images, fonts, and other heavy resources for speed
        const blockedResourceTypes = ['image', 'font', 'media', 'stylesheet'];
        await page.route('**/*', (route) => {
            const resourceType = route.request().resourceType();
            const handled = blockedResourceTypes.includes(resourceType)
                ? route.abort()
                : route.continue();
            // A request still in flight when the page is closed makes this
            // promise reject; nobody awaits it, so swallow the rejection here
            // instead of letting it surface as an unhandled rejection.
            Promise.resolve(handled).catch(() => { });
        });
        // SECURITY: Wrap entire operation in timeout
        const fetchPromise = (async () => {
            await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: timeoutMs,
            });
            // Wait for additional time if requested (for dynamic content)
            if (waitMs > 0) {
                await page.waitForTimeout(waitMs);
            }
            const html = await page.content();
            const finalUrl = page.url();
            return { html, finalUrl };
        })();
        const timeoutPromise = new Promise((_, reject) => {
            timeoutTimer = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
        });
        const { html, finalUrl } = await Promise.race([fetchPromise, timeoutPromise]);
        // SECURITY: the browser follows redirects on its own; make sure we did
        // not end up on a host the SSRF rules forbid before returning content.
        validateUrl(finalUrl);
        // SECURITY: Limit HTML size
        if (html.length > 10 * 1024 * 1024) { // 10MB limit
            throw new WebPeelError('Response too large (max 10MB)');
        }
        if (!html || html.length < 100) {
            throw new BlockedError('Empty or suspiciously small response from browser.');
        }
        return {
            html,
            url: finalUrl,
        };
    }
    catch (error) {
        if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
            throw error;
        }
        if (error instanceof Error && error.message.includes('Timeout')) {
            throw new TimeoutError(`Browser navigation timed out`);
        }
        throw new NetworkError(`Browser fetch failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    finally {
        // Disarm the race timer; left pending it keeps the process alive for
        // up to timeoutMs after the fetch has already finished.
        if (timeoutTimer !== null) {
            clearTimeout(timeoutTimer);
        }
        // CRITICAL: Always close page and decrement counter
        if (page) {
            await page.close().catch(() => { });
        }
        activePagesCount--;
    }
}
|
|
447
|
+
/**
 * Run `fn` up to `maxAttempts` times, sleeping between failed attempts with a
 * delay that doubles each time (baseDelayMs, 2x, 4x, ...).
 *
 * BlockedError and TimeoutError are rethrown immediately without retrying.
 *
 * @param {Function} fn - Async operation to attempt; called with no arguments.
 * @param {number} [maxAttempts=3] - Maximum number of calls to `fn`.
 * @param {number} [baseDelayMs=1000] - Delay after the first failure.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The last error seen (non-Error throwables are replaced by a generic
 *   Error), or a NetworkError when `fn` was never called.
 */
export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    let lastError = null;
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            const result = await fn();
            return result;
        }
        catch (error) {
            const isErrorObject = error instanceof Error;
            lastError = isErrorObject ? error : new Error('Unknown error');
            // Blocked responses and timeouts will not improve on retry
            const isFatal = error instanceof BlockedError || error instanceof TimeoutError;
            if (isFatal) {
                throw error;
            }
            const hasAttemptsLeft = attempt < maxAttempts;
            if (hasAttemptsLeft) {
                await pause(baseDelayMs * 2 ** (attempt - 1));
            }
        }
    }
    if (lastError) {
        throw lastError;
    }
    throw new NetworkError('Retry failed');
}
|
|
470
|
+
/**
 * Clean up browser resources: close the shared browser, if there is one.
 *
 * The shared reference is cleared before the close is awaited, so code that
 * runs while the browser is shutting down does not pick up the closing
 * instance, and the reference is dropped even when close() rejects.
 *
 * @returns {Promise<void>}
 */
export async function cleanup() {
    const browser = sharedBrowser;
    if (!browser) {
        return;
    }
    sharedBrowser = null;
    await browser.close();
}
|
|
479
|
+
//# sourceMappingURL=fetcher.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,QAAQ,EAA2B,MAAM,YAAY,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAErF,MAAM,WAAW,GAAG;IAClB,uHAAuH;IACvH,iHAAiH;IACjH,uHAAuH;IACvH,kFAAkF;IAClF,uGAAuG;CACxG,CAAC;AAEF,SAAS,kBAAkB;IACzB,OAAO,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC;AACrE,CAAC;AAED;;;GAGG;AACH,SAAS,WAAW,CAAC,SAAiB;IACpC,eAAe;IACf,IAAI,SAAS,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC;QAC5B,MAAM,IAAI,YAAY,CAAC,oCAAoC,CAAC,CAAC;IAC/D,CAAC;IAED,uDAAuD;IACvD,IAAI,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,YAAY,CAAC,yCAAyC,CAAC,CAAC;IACpE,CAAC;IAED,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,YAAY,CAAC,oBAAoB,CAAC,CAAC;IAC/C,CAAC;IAED,qBAAqB;IACrB,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;QAChD,MAAM,IAAI,YAAY,CAAC,2CAA2C,CAAC,CAAC;IACtE,CAAC;IAED,iCAAiC;IACjC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,YAAY,CAAC,kBAAkB,CAAC,CAAC;IAC7C,CAAC;IAED,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IAE5C,2BAA2B;IAC3B,MAAM,iBAAiB,GAAG,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;IACnD,IAAI,iBAAiB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,QAAQ,KAAK,OAAO,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,GAAG,OAAO,CAAC,CAAC,EAAE,CAAC;QAChG,MAAM,IAAI,YAAY,CAAC,oCAAoC,CAAC,CAAC;IAC/D,CAAC;IAED,iFAAiF;IACjF,MAAM,QAAQ,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAChD,IAAI,QAAQ,EAAE,CAAC;QACb,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAChC,CAAC;IAED,0CAA0C;IAC1C,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3B,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAChC,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,oBAAoB,CAAC,QAAgB;IAC5C,6BAA6B;IAC7B,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAEjD,wCAAwC;IACxC,MAAM,WAAW,GAAG,8CAA8C,CAAC;IACnE,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;IAC/C,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CA
AC,MAAM,CAAC,CAAC;QAChD,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YAC1C,OAAO,MAAM,CAAC;QAChB,CAAC;QACD,MAAM,IAAI,YAAY,CAAC,sBAAsB,CAAC,CAAC;IACjD,CAAC;IAED,2BAA2B;IAC3B,IAAI,kBAAkB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAClC,OAAO;YACL,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;YACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;YACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;YAClB,GAAG,GAAG,IAAI;SACX,CAAC;IACJ,CAAC;IAED,wDAAwD;IACxD,IAAI,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QAC5B,0BAA0B;QAC1B,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YAC9B,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACjC,IAAI,GAAG,IAAI,UAAU,EAAE,CAAC;gBACtB,OAAO;oBACL,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;oBAClB,GAAG,GAAG,IAAI;iBACX,CAAC;YACJ,CAAC;QACH,CAAC;QACD,kCAAkC;QAClC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YACvE,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBAC1C,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAClC,IAAI,GAAG,IAAI,UAAU,EAAE,CAAC;YACtB,OAAO;gBACL,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;gBACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;gBACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;gBAClB,GAAG,GAAG,IAAI;aACX,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,MAAgB;IAC3C,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC;IAE5B,wBAAwB;IACxB,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QACd,MAAM,IAAI,YAAY,CAAC,6CAA6C,CAAC,CAAC;IACxE,CAAC;IAED,sBAAsB;IACtB,IAAI,CAAC,KAAK,EAAE,EAAE,CAAC;QACb,MAAM,IAAI,YAAY,CAAC,
+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,yBAAyB;IACzB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QACpC,MAAM,IAAI,YAAY,CAAC,+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,0BAA0B;IAC1B,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,YAAY,CAAC,+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,6BAA6B;IAC7B,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,YAAY,CAAC,+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,6BAA6B;IAC7B,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QACrD,MAAM,IAAI,YAAY,CAAC,4CAA4C,CAAC,CAAC;IACvE,CAAC;IAED,0BAA0B;IAC1B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,MAAM,IAAI,YAAY,CAAC,mDAAmD,CAAC,CAAC;IAC9E,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,QAAgB;IAC3C,kBAAkB;IAClB,MAAM,IAAI,GAAG,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAE5D,gBAAgB;IAChB,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;QACjD,MAAM,IAAI,YAAY,CAAC,6CAA6C,CAAC,CAAC;IACxE,CAAC;IAED,2DAA2D;IAC3D,IAAI,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC/B,wBAAwB;QACxB,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;QAEnC,iEAAiE;QACjE,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC3B,oBAAoB;YACpB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;gBAC/C,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;oBACvD,mBAAmB,CAAC,MAAM,CAAC,CAAC;gBAC9B,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,gDAAgD;YAChD,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YAC1C,IAAI,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;gBACnC,MAAM,GAAG,GAAG,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;gBACjC,MAAM,MAAM,GAAG;oBACb,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;oBAClB,GAAG,GAAG,IAAI;iBACX,CAAC;gBACF,mBAAmB,CAAC,MAAM,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QACD,MAAM,IA
AI,YAAY,CAAC,qDAAqD,CAAC,CAAC;IAChF,CAAC;IAED,sDAAsD;IACtD,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QACnD,MAAM,IAAI,YAAY,CAAC,sDAAsD,CAAC,CAAC;IACjF,CAAC;IAED,wBAAwB;IACxB,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC;QAChD,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QACrD,MAAM,IAAI,YAAY,CAAC,oDAAoD,CAAC,CAAC;IAC/E,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,SAAiB;IAC1C,IAAI,SAAS,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,YAAY,CAAC,0CAA0C,CAAC,CAAC;IACrE,CAAC;IACD,wCAAwC;IACxC,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,YAAY,CAAC,wCAAwC,CAAC,CAAC;IACnE,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAQD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,GAAW,EACX,SAAkB,EAClB,YAAoB,KAAK;IAEzB,yCAAyC;IACzC,WAAW,CAAC,GAAG,CAAC,CAAC;IAEjB,kCAAkC;IAClC,MAAM,kBAAkB,GAAG,SAAS,CAAC,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,kBAAkB,EAAE,CAAC;IAE3F,MAAM,aAAa,GAAG,EAAE,CAAC;IACzB,IAAI,aAAa,GAAG,CAAC,CAAC;IACtB,IAAI,UAAU,GAAG,GAAG,CAAC;IACrB,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IAEnC,OAAO,aAAa,IAAI,aAAa,EAAE,CAAC;QACtC,wBAAwB;QACxB,IAAI,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,YAAY,CAAC,wBAAwB,CAAC,CAAC;QACnD,CAAC;QACD,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAEzB,+BAA+B;QAC/B,WAAW,CAAC,UAAU,CAAC,CAAC;QAExB,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;QAE9D,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,UAAU,EAAE;gBACvC,OAAO,EAAE;oBACP,YAAY,EAAE,kBAAkB;oBAChC,QAAQ,EAAE,4EAA4E;oBACtF,iBAAiB,EAAE,gBAAgB;oBACnC,iBAAiB,EAAE,mBAAmB;oBACtC,KAAK,EAAE,GAAG;oBACV,YAAY,EAAE,YAAY;oBAC1B,2BAA2B,EAAE,GAAG;iBACjC;gBACD,MAAM,EAAE,UAAU,CAAC,MAAM;gBACzB,QAAQ,EAAE,QAAQ,EAAE,qCAAqC;aAC1D,CAAC,CAAC;YAEH,YAAY,CAAC,KAAK,CAAC,CAAC;YAEpB,4BAA4B;YAC5B,IAAI,QAAQ,CAAC,MAAM,IAAI,GAAG,IAAI,QAAQ,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;gBACpD,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;gBAClD,IAAI,CAAC,QAAQ,EAAE,
CAAC;oBACd,MAAM,IAAI,YAAY,CAAC,2CAA2C,CAAC,CAAC;gBACtE,CAAC;gBAED,wBAAwB;gBACxB,UAAU,GAAG,IAAI,GAAG,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC;gBAChD,aAAa,EAAE,CAAC;gBAChB,SAAS;YACX,CAAC;YAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;oBACvD,MAAM,IAAI,YAAY,CACpB,QAAQ,QAAQ,CAAC,MAAM,iEAAiE,CACzF,CAAC;gBACJ,CAAC;gBACD,MAAM,IAAI,YAAY,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YAC5E,CAAC;YAED,kCAAkC;YAClC,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC,EAAE,CAAC;gBACzF,MAAM,IAAI,YAAY,CAAC,mDAAmD,CAAC,CAAC;YAC9E,CAAC;YAED,wEAAwE;YACxE,MAAM,MAAM,GAAiB,EAAE,CAAC;YAChC,IAAI,SAAS,GAAG,CAAC,CAAC;YAClB,MAAM,QAAQ,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,OAAO;YAE1C,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;YAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,MAAM,IAAI,YAAY,CAAC,+BAA+B,CAAC,CAAC;YAC1D,CAAC;YAED,IAAI,CAAC;gBACH,OAAO,IAAI,EAAE,CAAC;oBACZ,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;oBAC5C,IAAI,IAAI;wBAAE,MAAM;oBAEhB,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC;oBAC1B,IAAI,SAAS,GAAG,QAAQ,EAAE,CAAC;wBACzB,MAAM,CAAC,MAAM,EAAE,CAAC;wBAChB,MAAM,IAAI,YAAY,CAAC,+BAA+B,CAAC,CAAC;oBAC1D,CAAC;oBAED,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBACrB,CAAC;YACH,CAAC;oBAAS,CAAC;gBACT,MAAM,CAAC,WAAW,EAAE,CAAC;YACvB,CAAC;YAED,iBAAiB;YACjB,MAAM,QAAQ,GAAG,IAAI,UAAU,CAAC,SAAS,CAAC,CAAC;YAC3C,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;gBAC5B,MAAM,IAAI,KAAK,CAAC,MAAM,CAAC;YACzB,CAAC;YAED,MAAM,IAAI,GAAG,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAEhD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;gBAC/B,MAAM,IAAI,YAAY,CAAC,oEAAoE,CAAC,CAAC;YAC/F,CAAC;YAED,iCAAiC;YACjC,IAAI,IAAI,CAAC,QAAQ,CAAC,yBAAyB,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,EAAE,CAAC;gBAClF,MAAM,IAAI,YAAY,CAAC,+DAA+D,CAAC,CAAC;YAC1F,CAAC;YAED,OAAO;gBACL,IAAI;g
BACJ,GAAG,EAAE,UAAU;gBACf,UAAU,EAAE,QAAQ,CAAC,MAAM;aAC5B,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,YAAY,CAAC,KAAK,CAAC,CAAC;YAEpB,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,EAAE,CAAC;gBACpG,MAAM,KAAK,CAAC;YACd,CAAC;YAED,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAC1D,MAAM,IAAI,YAAY,CAAC,2BAA2B,SAAS,IAAI,CAAC,CAAC;YACnE,CAAC;YAED,MAAM,IAAI,YAAY,CAAC,oBAAoB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC;QACzG,CAAC;IACH,CAAC;IAED,MAAM,IAAI,YAAY,CAAC,2BAA2B,aAAa,GAAG,CAAC,CAAC;AACtE,CAAC;AAED,IAAI,aAAa,GAAmB,IAAI,CAAC;AACzC,IAAI,gBAAgB,GAAG,CAAC,CAAC;AACzB,MAAM,oBAAoB,GAAG,CAAC,CAAC;AAE/B,KAAK,UAAU,UAAU;IACvB,4DAA4D;IAC5D,IAAI,aAAa,EAAE,CAAC;QAClB,IAAI,CAAC;YACH,IAAI,aAAa,CAAC,WAAW,EAAE,EAAE,CAAC;gBAChC,OAAO,aAAa,CAAC;YACvB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,4BAA4B;YAC5B,aAAa,GAAG,IAAI,CAAC;QACvB,CAAC;IACH,CAAC;IAED,aAAa,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,OAAO,aAAa,CAAC;AACvB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,GAAW,EACX,UAII,EAAE;IAEN,yCAAyC;IACzC,WAAW,CAAC,GAAG,CAAC,CAAC;IAEjB,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,CAAC,EAAE,SAAS,GAAG,KAAK,EAAE,GAAG,OAAO,CAAC;IAE7D,kCAAkC;IAClC,MAAM,kBAAkB,GAAG,SAAS,CAAC,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,kBAAkB,EAAE,CAAC;IAE3F,qBAAqB;IACrB,IAAI,MAAM,GAAG,CAAC,IAAI,MAAM,GAAG,KAAK,EAAE,CAAC;QACjC,MAAM,IAAI,YAAY,CAAC,yCAAyC,CAAC,CAAC;IACpE,CAAC;IAED,wDAAwD;IACxD,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAClC,MAAM,gBAAgB,GAAG,KAAK,CAAC,CAAC,qBAAqB;IAErD,OAAO,gBAAgB,IAAI,oBAAoB,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,cAAc,GAAG,gBAAgB,EAAE,CAAC;YACnD,MAAM,IAAI,YAAY,CAAC,2DAA2D,CAAC,CAAC;QACtF,CAAC;QACD,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,CAAC;IACzD,CAAC;IAED,gBAAgB,EAAE,CAAC;IACnB,IAAI,IAAI,GAAgB,IAAI,CAAC;IAE7B,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,UAAU,EAAE,CAAC;QACnC,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,CAAC;YAC3B,SAAS,EAAE,kBAAkB;SAC9B,CAAC,CAAC;QAEH,2DAA2D;QAC3D,MAAM,IAAI,CAA
C,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;YACjC,MAAM,YAAY,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;YACpD,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACpE,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,QAAQ,EAAE,CAAC;YACnB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,6CAA6C;QAC7C,MAAM,YAAY,GAAG,CAAC,KAAK,IAAI,EAAE;YAC/B,MAAM,IAAK,CAAC,IAAI,CAAC,GAAG,EAAE;gBACpB,SAAS,EAAE,kBAAkB;gBAC7B,OAAO,EAAE,SAAS;aACnB,CAAC,CAAC;YAEH,8DAA8D;YAC9D,IAAI,MAAM,GAAG,CAAC,EAAE,CAAC;gBACf,MAAM,IAAK,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;YACrC,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,IAAK,CAAC,OAAO,EAAE,CAAC;YACnC,MAAM,QAAQ,GAAG,IAAK,CAAC,GAAG,EAAE,CAAC;YAE7B,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;QAC5B,CAAC,CAAC,EAAE,CAAC;QAEL,MAAM,cAAc,GAAG,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;YACtD,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,YAAY,CAAC,6BAA6B,SAAS,IAAI,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QACpG,CAAC,CAAC,CAAC;QAEH,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,YAAY,EAAE,cAAc,CAAC,CAAC,CAAC;QAE9E,4BAA4B;QAC5B,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC,CAAC,aAAa;YACjD,MAAM,IAAI,YAAY,CAAC,+BAA+B,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC/B,MAAM,IAAI,YAAY,CAAC,oDAAoD,CAAC,CAAC;QAC/E,CAAC;QAED,OAAO;YACL,IAAI;YACJ,GAAG,EAAE,QAAQ;SACd,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,EAAE,CAAC;YACpG,MAAM,KAAK,CAAC;QACd,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;YAChE,MAAM,IAAI,YAAY,CAAC,8BAA8B,CAAC,CAAC;QACzD,CAAC;QAED,MAAM,IAAI,YAAY,CACpB,yBAAyB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CACpF,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,oDAAoD;QACpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QACrC,CAAC;QACD,gBAAgB,EAAE,CAAC;IACrB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,EAAoB,EACpB,cAAsB,CAAC,EACvB,cAAsB,IAAI;IAE1B,I
AAI,SAAS,GAAiB,IAAI,CAAC;IAEnC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,WAAW,EAAE,OAAO,EAAE,EAAE,CAAC;QACxD,IAAI,CAAC;YACH,OAAO,MAAM,EAAE,EAAE,CAAC;QACpB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,SAAS,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC;YAExE,4CAA4C;YAC5C,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,EAAE,CAAC;gBACnE,MAAM,KAAK,CAAC;YACd,CAAC;YAED,IAAI,OAAO,GAAG,WAAW,EAAE,CAAC;gBAC1B,MAAM,KAAK,GAAG,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,GAAG,CAAC,CAAC,CAAC;gBACrD,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,SAAS,IAAI,IAAI,YAAY,CAAC,cAAc,CAAC,CAAC;AACtD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO;IAC3B,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,aAAa,CAAC,KAAK,EAAE,CAAC;QAC5B,aAAa,GAAG,IAAI,CAAC;IACvB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML to Markdown conversion with smart cleanup
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Convert HTML to clean, readable Markdown
|
|
6
|
+
*/
|
|
7
|
+
export declare function htmlToMarkdown(html: string): string;
|
|
8
|
+
/**
|
|
9
|
+
* Convert HTML to plain text (strip all formatting)
|
|
10
|
+
*/
|
|
11
|
+
export declare function htmlToText(html: string): string;
|
|
12
|
+
/**
|
|
13
|
+
* Estimate token count (very rough approximation)
|
|
14
|
+
* Rule of thumb: 1 token ≈ 4 characters for English text
|
|
15
|
+
*/
|
|
16
|
+
export declare function estimateTokens(text: string): number;
|
|
17
|
+
//# sourceMappingURL=markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../src/core/markdown.ts"],"names":[],"mappings":"AAAA;;GAEG;AA8DH;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CA4DnD;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAuB/C;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEnD"}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML to Markdown conversion with smart cleanup
|
|
3
|
+
*/
|
|
4
|
+
import TurndownService from 'turndown';
|
|
5
|
+
import * as cheerio from 'cheerio';
|
|
6
|
+
// CSS selectors for elements that cleanHTML() strips from a page before it is
// converted. Order is kept as-is; each selector is removed independently.
const JUNK_SELECTORS = [
    // Non-content markup
    'script', 'style',
    // Site chrome
    'nav', 'footer', 'header.site-header', 'aside', '.sidebar',
    // Advertising
    '.advertisement', '.ad',
    // Cookie notices (exact class names)
    '.cookie-banner', '.cookie-notice',
    // Engagement widgets
    '.newsletter-signup', '.social-share', '.related-posts',
    // Comment sections
    '.comments', '#comments',
    // Cookie notices (exact class, then substring matches on class / id)
    '.cookie-consent', '[class*="cookie"]', '[id*="cookie"]',
    // Overlays, matched by class-name substring
    '[class*="banner"]', '[class*="popup"]', '[class*="modal"]',
];
|
|
30
|
+
/**
 * Clean HTML before conversion.
 * Removes every element matched by JUNK_SELECTORS (navigation, ads, cookie
 * banners, ...) and then drops elements that are left without any content.
 *
 * @param {string} html - Raw HTML to clean.
 * @returns {string} The cleaned document, serialized back to HTML.
 * @throws {Error} If `html` is longer than 10 * 1024 * 1024 characters.
 */
function cleanHTML(html) {
    // SECURITY: Limit HTML size to prevent DoS
    if (html.length > 10 * 1024 * 1024) { // 10MB
        throw new Error('HTML too large to process (max 10MB)');
    }
    // Tags that carry meaning even with no text and no child elements.
    // Without this exemption the empty-element sweep below would delete every
    // image, line break, rule, and empty table cell in the document.
    const CONTENT_WITHOUT_TEXT = new Set(['img', 'br', 'hr', 'td', 'th']);
    const $ = cheerio.load(html);
    // Remove junk elements
    JUNK_SELECTORS.forEach((selector) => {
        $(selector).remove();
    });
    // Remove empty paragraphs and divs
    $('p:empty, div:empty').remove();
    // Remove leaf elements that contain nothing but whitespace
    $('*').each((_, elem) => {
        if (CONTENT_WITHOUT_TEXT.has(elem.name)) {
            return;
        }
        const $elem = $(elem);
        // Check for child elements first: it is cheap and spares us from
        // computing the full subtree text of every container.
        if ($elem.children().length > 0) {
            return;
        }
        if (!$elem.text().trim()) {
            $elem.remove();
        }
    });
    return $.html();
}
|
|
56
|
+
/**
 * Convert HTML to clean, readable Markdown.
 *
 * The input is first passed through cleanHTML(), then converted with a
 * Turndown instance configured for ATX headings, fenced code blocks, `-`
 * bullets, `_` emphasis and `**` strong. Table elements are kept as HTML.
 * The result is capped at 1MB (1024 * 1024 characters), runs of blank lines
 * are collapsed to a single blank line, and the output is trimmed.
 *
 * @param {string} html - Raw HTML document or fragment.
 * @returns {string} The Markdown rendering of `html`.
 * @throws {Error} Propagated from cleanHTML() when `html` exceeds its size limit.
 */
export function htmlToMarkdown(html) {
    const cleanedHTML = cleanHTML(html);
    const turndown = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        bulletListMarker: '-',
        emDelimiter: '_',
        strongDelimiter: '**',
    });
    // Preserve tables
    turndown.keep(['table', 'thead', 'tbody', 'tr', 'th', 'td']);
    // Custom rule: emit images that have alt text, skip the rest
    turndown.addRule('images', {
        filter: 'img',
        replacement: (_content, node) => {
            const alt = node.alt;
            const src = node.src;
            if (alt) {
                return `![${alt}](${src})`;
            }
            return '';
        },
    });
    // Custom rule: preserve code blocks
    turndown.addRule('codeBlocks', {
        filter: (node) => {
            return node.nodeName === 'PRE' && node.firstChild?.nodeName === 'CODE';
        },
        replacement: (_content, node) => {
            const codeNode = node.firstChild;
            const className = codeNode.getAttribute('class') || '';
            // A `language-xxx` class on <code> becomes the fence info string
            const language = className.match(/language-(\w+)/)?.[1] || '';
            return '\n\n```' + language + '\n' + codeNode.textContent + '\n```\n\n';
        },
    });
    let markdown = turndown.turndown(cleanedHTML);
    // SECURITY: Protect against ReDoS - limit input size before further processing
    if (markdown.length > 1024 * 1024) { // 1MB limit for markdown
        markdown = markdown.slice(0, 1024 * 1024);
    }
    // Clean up excessive newlines without regex: drop a blank line whenever
    // the line immediately before it is blank as well.
    const isBlank = (line) => line.trim() === '';
    const lines = markdown.split('\n');
    markdown = lines
        .filter((line, i) => i === 0 || !(isBlank(lines[i - 1]) && isBlank(line)))
        .join('\n');
    // Remove leading/trailing whitespace
    return markdown.trim();
}
|
|
113
|
+
/**
 * Convert HTML to plain text (strip all formatting).
 *
 * The input is cleaned with cleanHTML(), then the text of every heading,
 * paragraph and list item is emitted as its own paragraph. If none of those
 * yield text, the whole <body> text is used instead. Runs of three or more
 * newlines are reduced to two, and runs of spaces/tabs to a single space.
 *
 * @param {string} html - Raw HTML document or fragment.
 * @returns {string} Trimmed plain-text rendering of `html`.
 * @throws {Error} Propagated from cleanHTML() when `html` exceeds its size limit.
 */
export function htmlToText(html) {
    const BLOCK_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, li';
    const cleanedHTML = cleanHTML(html);
    const $ = cheerio.load(cleanedHTML);
    // Get text content, preserving some structure
    let text = '';
    $(BLOCK_SELECTOR).each((_, elem) => {
        let $block = $(elem);
        // A block that contains other matching blocks (e.g. <li><p>..</p></li>
        // or a nested list) contributes only its own text here; the nested
        // blocks are visited separately, so including them would emit their
        // text twice.
        if ($block.find(BLOCK_SELECTOR).length > 0) {
            $block = $block.clone();
            $block.find(BLOCK_SELECTOR).remove();
        }
        const content = $block.text().trim();
        if (content) {
            text += content + '\n\n';
        }
    });
    // Fallback: if no structured content found, get all text
    if (!text.trim()) {
        text = $('body').text();
    }
    // Clean up excessive whitespace
    text = text.replace(/\n{3,}/g, '\n\n');
    text = text.replace(/[ \t]+/g, ' ');
    return text.trim();
}
|
|
136
|
+
/**
 * Rough token-count estimate for a piece of text.
 * Uses the rule of thumb that 1 token ≈ 4 characters of English text and
 * rounds any partial token up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} `text.length / 4`, rounded up to the next integer.
 */
export function estimateTokens(text) {
    const charsPerToken = 4;
    const charCount = text.length;
    return Math.ceil(charCount / charsPerToken);
}
|
|
143
|
+
//# sourceMappingURL=markdown.js.map
|