@vakra-dev/reader 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +29 -26
- package/dist/cli/index.js +1356 -1039
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +233 -50
- package/dist/index.js +1591 -1042
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.d.ts
CHANGED
@@ -90,6 +90,20 @@ interface IBrowserPool {
     healthCheck?(): Promise<HealthStatus>;
 }
 
+/**
+ * Engine types for multi-engine scraping architecture
+ *
+ * Engine stack (in order of preference):
+ * 1. http - Native fetch, fastest, no browser
+ * 2. tlsclient - TLS fingerprinting via got-scraping
+ * 3. hero - Full browser with JavaScript execution
+ */
+
+/**
+ * Available engine names
+ */
+type EngineName = "http" | "tlsclient" | "hero";
+
 /**
  * Proxy configuration for Hero
  */
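The engine stack documented above is the headline change in 0.1.0. The orchestrator itself ships only in the compiled bundle, so the following sketch is purely illustrative of the cascade the comment describes; `EngineFetch` and `cascade` are assumed names for this example, not package exports.

```ts
// Illustrative sketch of a preference-ordered engine cascade (not a package export).
type EngineName = "http" | "tlsclient" | "hero";

// Assumed shape of a single engine for this sketch.
type EngineFetch = (url: string) => Promise<string>;

async function cascade(
  url: string,
  engines: Record<EngineName, EngineFetch>,
  order: EngineName[] = ["http", "tlsclient", "hero"],
): Promise<string> {
  let lastError: unknown;
  for (const name of order) {
    try {
      // First engine that succeeds wins; cheaper engines are tried first.
      return await engines[name](url);
    } catch (err) {
      // Fall through to the next, heavier engine.
      lastError = err;
    }
  }
  throw lastError;
}
```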
@@ -139,12 +153,12 @@ interface BrowserPoolConfig {
 interface ScrapeOptions {
     /** Array of URLs to scrape */
     urls: string[];
-    /** Output formats (default: ['markdown']) */
-    formats?: Array<"markdown" | "html"
-    /** Include URL, title, timestamp (default: true) */
-    includeMetadata?: boolean;
+    /** Output formats - which content fields to include (default: ['markdown']) */
+    formats?: Array<"markdown" | "html">;
     /** Custom user agent string */
     userAgent?: string;
+    /** Custom headers for requests */
+    headers?: Record<string, string>;
     /** Request timeout in milliseconds (default: 30000) */
     timeoutMs?: number;
     /** URL patterns to include (regex strings) */
@@ -155,6 +169,12 @@ interface ScrapeOptions {
     removeAds?: boolean;
     /** Remove base64-encoded images to reduce output size (default: true) */
     removeBase64Images?: boolean;
+    /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+    onlyMainContent?: boolean;
+    /** CSS selectors for elements to include (if set, only these elements are kept) */
+    includeTags?: string[];
+    /** CSS selectors for elements to exclude (removed from output) */
+    excludeTags?: string[];
     /** Skip TLS/SSL certificate verification (default: true) */
     skipTLSVerification?: boolean;
     /** Number of URLs to process in parallel (default: 1 - sequential) */
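The new `headers` field from the previous hunk and the content-filtering fields added here can be combined in one options object. The field names and types come from this diff; the values and the root-package import are illustrative assumptions.

```ts
import type { ScrapeOptions } from "@vakra-dev/reader";

// Example options exercising fields added in 0.1.0 (values are made up).
const options: ScrapeOptions = {
  urls: ["https://example.com/blog/post"],
  formats: ["markdown", "html"],
  headers: { "Accept-Language": "en-US" }, // new: custom request headers
  onlyMainContent: true,                   // new: drop nav/header/footer/sidebar
  includeTags: ["article", ".post-body"],  // new: keep only these selectors
  excludeTags: [".newsletter-signup"],     // new: always remove these selectors
};
```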
@@ -183,6 +203,12 @@ interface ScrapeOptions {
     browserPool?: BrowserPoolConfig;
     /** Browser pool instance (internal, provided by ReaderClient) */
     pool?: IBrowserPool;
+    /** Engines to use in order (default: ['http', 'tlsclient', 'hero']) */
+    engines?: EngineName[];
+    /** Skip specific engines (e.g., ['http'] to skip native fetch) */
+    skipEngines?: EngineName[];
+    /** Force a specific engine, skipping the cascade */
+    forceEngine?: EngineName;
 }
 /**
  * Website metadata extracted from the base page
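Engine selection in 0.1.0 has three knobs: `engines` sets the cascade order, `skipEngines` removes engines from it, and `forceEngine` bypasses the cascade entirely. A few illustrative option fragments (values invented):

```ts
import type { ScrapeOptions } from "@vakra-dev/reader";

// Use the default cascade order explicitly.
const cascadeOrder: Pick<ScrapeOptions, "urls" | "engines"> = {
  urls: ["https://example.com"],
  engines: ["http", "tlsclient", "hero"],
};

// Never try plain fetch; start with the TLS client and escalate to Hero.
const noPlainFetch: Pick<ScrapeOptions, "urls" | "skipEngines"> = {
  urls: ["https://example.com"],
  skipEngines: ["http"],
};

// Always use the full browser, skipping the cascade entirely.
const browserOnly: Pick<ScrapeOptions, "urls" | "forceEngine"> = {
  urls: ["https://example.com"],
  forceEngine: "hero",
};
```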
@@ -247,17 +273,13 @@ interface Page {
     waitTimeMs?: number;
 }
 /**
- * Individual website scrape result
+ * Individual website scrape result
  */
 interface WebsiteScrapeResult {
-    /** Markdown
+    /** Markdown content (present if 'markdown' in formats) */
     markdown?: string;
-    /** HTML
+    /** HTML content (present if 'html' in formats) */
     html?: string;
-    /** JSON output (present if 'json' in formats) */
-    json?: string;
-    /** Plain text output (present if 'text' in formats) */
-    text?: string;
     /** Metadata about the scraping operation */
     metadata: {
         /** Base URL that was scraped */
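Since `json` and `text` are gone from `WebsiteScrapeResult`, 0.0.2 callers that read those fields need to switch to `markdown` or `html`. A small, purely illustrative helper:

```ts
import type { WebsiteScrapeResult } from "@vakra-dev/reader";

// Prefer markdown when it was requested, otherwise fall back to html.
// (The 0.0.2 `json` and `text` fields no longer exist in 0.1.0.)
function pickContent(result: WebsiteScrapeResult): string {
  return result.markdown ?? result.html ?? "";
}
```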
@@ -306,18 +328,22 @@ interface ScrapeResult {
 /**
  * Default scrape options
  */
-declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "browserPool" | "pool"> & {
+declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "headers" | "browserPool" | "pool" | "engines" | "skipEngines" | "forceEngine"> & {
     proxy?: ProxyConfig;
     waitForSelector?: string;
     connectionToCore?: any;
     userAgent?: string;
+    headers?: Record<string, string>;
     browserPool?: BrowserPoolConfig;
     pool?: IBrowserPool;
+    engines?: EngineName[];
+    skipEngines?: EngineName[];
+    forceEngine?: EngineName;
 };
 /**
  * Format type guard
  */
-declare function isValidFormat(format: string): format is "markdown" | "html"
+declare function isValidFormat(format: string): format is "markdown" | "html";
 /**
  * Check if a URL should be crawled based on base domain
  */
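`isValidFormat` now narrows to the two remaining formats. Typical use is validating untrusted input such as a CLI flag; the snippet below is illustrative.

```ts
import { isValidFormat } from "@vakra-dev/reader";

const requested: string = process.argv[2] ?? "markdown";
if (isValidFormat(requested)) {
  // Inside this branch `requested` is typed as "markdown" | "html".
  console.log(`using format: ${requested}`);
} else {
  console.error(`unsupported format: ${requested}`);
}
```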
@@ -343,8 +369,8 @@ interface CrawlOptions {
     includePatterns?: string[];
     /** URL patterns to exclude (regex strings) - matching URLs are skipped */
     excludePatterns?: string[];
-    /** Output formats for scraped content (default: ['markdown'
-    formats?: Array<"markdown" | "html"
+    /** Output formats for scraped content (default: ['markdown']) */
+    formats?: Array<"markdown" | "html">;
     /** Number of URLs to scrape in parallel (default: 2) */
     scrapeConcurrency?: number;
     /** Remove ads and tracking elements (default: true) */
@@ -547,7 +573,6 @@ declare class ReaderClient {
  */
 declare class Scraper {
     private options;
-    private pool;
     private logger;
     private robotsCache;
     constructor(options: ScrapeOptions);
@@ -570,13 +595,7 @@ declare class Scraper {
      */
     private scrapeSingleUrlWithRetry;
     /**
-     *
-     * Cloudflare often does silent redirects even when bypassed, we need to ensure
-     * we're on the actual content page before scraping.
-     */
-    private waitForFinalPage;
-    /**
-     * Scrape a single URL
+     * Scrape a single URL using the engine orchestrator
      */
     private scrapeSingleUrl;
     /**
@@ -828,31 +847,31 @@ declare class DaemonClient {
 declare function isDaemonRunning(port?: number): Promise<boolean>;
 
 /**
- * Convert
+ * Convert HTML to Markdown
+ *
+ * Simple conversion without any headers, metadata, or formatting wrappers.
+ * Returns clean markdown content ready for LLM consumption.
  */
-declare function
-
+declare function htmlToMarkdown(html: string): string;
 /**
- *
+ * Alias for htmlToMarkdown (backward compatibility)
  */
-declare
+declare const formatToMarkdown: typeof htmlToMarkdown;
 
 /**
- *
-
-
-
- *
+ * HTML formatter
+ *
+ * Returns the cleaned HTML content as-is.
+ * The content has already been processed by content-cleaner.ts
+ * (ads removed, base64 images stripped, scripts/styles removed).
  */
-declare function formatToJsonLite(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
-
 /**
- *
+ * Return HTML content as-is (already cleaned by content-cleaner)
  *
- *
- *
+ * This is essentially a pass-through. The cleaning happens in scraper.ts
+ * via cleanContent() before this is called.
  */
-declare function
+declare function formatToHTML(html: string): string;
 
 /**
  * Extract comprehensive website metadata from HTML content
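The formatter exports are now fully spelled out: `htmlToMarkdown`, its alias `formatToMarkdown`, and the pass-through `formatToHTML`. Usage based only on the signatures above:

```ts
import { formatToHTML, formatToMarkdown, htmlToMarkdown } from "@vakra-dev/reader";

const html = "<h1>Hello</h1><p>World</p>";

const md = htmlToMarkdown(html);        // plain markdown, no metadata wrappers
const same = formatToMarkdown(html);    // backward-compatible alias, same behaviour
const passthrough = formatToHTML(html); // returns the (already cleaned) HTML as-is
```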
@@ -862,6 +881,13 @@ declare function extractMetadata(html: string, baseUrl: string): WebsiteMetadata
 
 /**
  * HTML content cleaning utilities using DOM parsing
+ *
+ * Layered extraction strategy:
+ * 1. Remove scripts, styles, hidden elements (always safe)
+ * 2. Remove overlays/modals (always safe)
+ * 3. Remove ads (if enabled)
+ * 4. Remove navigation with protection (check each element before removing)
+ * 5. Find and isolate main content
  */
 /**
  * Content cleaning options
@@ -871,9 +897,15 @@ interface CleaningOptions {
     removeAds?: boolean;
     /** Remove base64-encoded images (default: true) */
     removeBase64Images?: boolean;
+    /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+    onlyMainContent?: boolean;
+    /** CSS selectors for elements to include (if set, only these elements are kept) */
+    includeTags?: string[];
+    /** CSS selectors for elements to exclude (removed from output) */
+    excludeTags?: string[];
 }
 /**
- *
+ * Main export - clean HTML content
  */
 declare function cleanContent(html: string, baseUrl: string, options?: CleaningOptions): string;
 
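`cleanContent` accepts the same new filtering options as `ScrapeOptions`. An illustrative call, with made-up HTML and selector values:

```ts
import { cleanContent } from "@vakra-dev/reader";

const rawHtml =
  "<html><body><nav>menu</nav><article>Post body</article></body></html>";

const cleaned = cleanContent(rawHtml, "https://example.com/article", {
  removeAds: true,
  removeBase64Images: true,
  onlyMainContent: true,           // new in 0.1.0
  includeTags: ["article"],        // keep only <article> subtrees
  excludeTags: [".cookie-banner"], // always drop matching elements
});
```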
@@ -895,6 +927,14 @@ declare function isValidUrl(string: string): boolean;
 declare function isSameDomain(url: string, baseUrl: string): boolean;
 /**
  * Generate a URL key for deduplication
+ * Normalizes:
+ * - Removes fragments (hash)
+ * - Removes search params
+ * - Removes trailing slashes (except root)
+ * - Lowercases
+ * - Normalizes www vs non-www
+ * - Removes default ports (80 for http, 443 for https)
+ * - Normalizes index files (index.html, index.htm, default.html)
 */
 declare function getUrlKey(url: string): string;
 /**
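Given the normalization rules now documented for `getUrlKey`, URLs that differ only in www, default port, fragment, trailing slash, or an index file should map to the same key. The expectation below follows from those rules and is not a test shipped with the package:

```ts
import { getUrlKey } from "@vakra-dev/reader";

const a = getUrlKey("https://www.example.com:443/docs/index.html#intro");
const b = getUrlKey("https://example.com/docs/");

// Expected to be equal per the documented normalization rules.
console.log(a === b);
```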
@@ -1083,18 +1123,15 @@ interface ChallengeWaitOptions {
 /**
  * Detect if current page is a Cloudflare challenge
  *
- * Uses multi-signal approach
- *
+ * Uses multi-signal approach requiring BOTH:
+ * 1. Cloudflare infrastructure indicators (cdn-cgi, cf-ray, etc.)
+ * 2. Challenge-specific elements or text
+ *
+ * This prevents false positives on login pages or other sites
+ * that happen to use similar text.
  *
  * @param hero - Hero instance with loaded page
  * @returns Detection result with confidence score and signals
- *
- * @example
- * const detection = await detectChallenge(hero);
- * if (detection.isChallenge) {
- *   console.log(`Challenge detected: ${detection.type}`);
- *   console.log(`Signals: ${detection.signals.join(', ')}`);
- * }
 */
 declare function detectChallenge(hero: Hero): Promise<ChallengeDetection>;
 /**
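The JSDoc `@example` was dropped from `detectChallenge` in 0.1.0, but the same usage still applies; the field names below come from the removed example. The `Hero` import path is an assumption based on the package's Hero integration.

```ts
import Hero from "@ulixee/hero";
import { detectChallenge } from "@vakra-dev/reader";

// `hero` is assumed to already have the target page loaded.
async function logChallenge(hero: Hero): Promise<void> {
  const detection = await detectChallenge(hero);
  if (detection.isChallenge) {
    console.log(`Challenge detected: ${detection.type}`);
    console.log(`Signals: ${detection.signals.join(", ")}`);
  }
}
```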
@@ -1213,4 +1250,150 @@ declare function createProxyUrl(config: ProxyConfig): string;
  */
 declare function parseProxyUrl(url: string): ProxyConfig;
 
-
+/**
+ * Typed error classes for Reader
+ *
+ * Provides actionable error messages and structured error information
+ * for better debugging and error handling.
+ */
+/**
+ * Error codes for categorization
+ */
+declare enum ReaderErrorCode {
+    NETWORK_ERROR = "NETWORK_ERROR",
+    TIMEOUT = "TIMEOUT",
+    CONNECTION_REFUSED = "CONNECTION_REFUSED",
+    CLOUDFLARE_CHALLENGE = "CLOUDFLARE_CHALLENGE",
+    BOT_DETECTED = "BOT_DETECTED",
+    ACCESS_DENIED = "ACCESS_DENIED",
+    CONTENT_EXTRACTION_FAILED = "CONTENT_EXTRACTION_FAILED",
+    EMPTY_CONTENT = "EMPTY_CONTENT",
+    INVALID_URL = "INVALID_URL",
+    INVALID_OPTIONS = "INVALID_OPTIONS",
+    ROBOTS_BLOCKED = "ROBOTS_BLOCKED",
+    BROWSER_ERROR = "BROWSER_ERROR",
+    POOL_EXHAUSTED = "POOL_EXHAUSTED",
+    CLIENT_CLOSED = "CLIENT_CLOSED",
+    NOT_INITIALIZED = "NOT_INITIALIZED",
+    UNKNOWN = "UNKNOWN"
+}
+/**
+ * Base error class for all Reader errors
+ */
+declare class ReaderError extends Error {
+    readonly code: ReaderErrorCode;
+    readonly url?: string;
+    readonly cause?: Error;
+    readonly timestamp: string;
+    readonly retryable: boolean;
+    constructor(message: string, code: ReaderErrorCode, options?: {
+        url?: string;
+        cause?: Error;
+        retryable?: boolean;
+    });
+    /**
+     * Convert to a plain object for serialization
+     */
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Network-related errors (connection issues, DNS failures, etc.)
+ */
+declare class NetworkError extends ReaderError {
+    constructor(message: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+}
+/**
+ * Timeout errors (page load, navigation, etc.)
+ */
+declare class TimeoutError extends ReaderError {
+    readonly timeoutMs: number;
+    constructor(message: string, timeoutMs: number, options?: {
+        url?: string;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Cloudflare challenge errors
+ */
+declare class CloudflareError extends ReaderError {
+    readonly challengeType: string;
+    constructor(challengeType: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Access denied errors (blocked, forbidden, etc.)
+ */
+declare class AccessDeniedError extends ReaderError {
+    readonly statusCode?: number;
+    constructor(message: string, options?: {
+        url?: string;
+        statusCode?: number;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Content extraction errors
+ */
+declare class ContentExtractionError extends ReaderError {
+    constructor(message: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+}
+/**
+ * Validation errors (invalid URLs, options, etc.)
+ */
+declare class ValidationError extends ReaderError {
+    readonly field?: string;
+    constructor(message: string, options?: {
+        field?: string;
+        url?: string;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * URL validation error
+ */
+declare class InvalidUrlError extends ReaderError {
+    constructor(url: string, reason?: string);
+}
+/**
+ * Robots.txt blocked error
+ */
+declare class RobotsBlockedError extends ReaderError {
+    constructor(url: string);
+}
+/**
+ * Browser pool errors
+ */
+declare class BrowserPoolError extends ReaderError {
+    constructor(message: string, options?: {
+        cause?: Error;
+    });
+}
+/**
+ * Client state errors
+ */
+declare class ClientClosedError extends ReaderError {
+    constructor();
+}
+/**
+ * Not initialized error
+ */
+declare class NotInitializedError extends ReaderError {
+    constructor(component: string);
+}
+/**
+ * Helper to wrap unknown errors in ReaderError
+ */
+declare function wrapError(error: unknown, url?: string): ReaderError;
+
+export { AccessDeniedError, type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, BrowserPoolError, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, ClientClosedError, CloudflareError, ContentExtractionError, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, InvalidUrlError, NetworkError, NotInitializedError, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, ReaderError, ReaderErrorCode, RobotsBlockedError, type ScrapeOptions, type ScrapeResult, Scraper, TimeoutError, ValidationError, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToMarkdown, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, htmlToMarkdown, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector, wrapError };
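The new error hierarchy gives callers a `code`, a `retryable` flag, and `toJSON()` on every error, plus `wrapError` for unknown throwables. A sketch of how a caller might use it; `doScrape` and `safeScrape` are stand-ins, not package exports.

```ts
import {
  ReaderError,
  ReaderErrorCode,
  TimeoutError,
  wrapError,
} from "@vakra-dev/reader";

// Illustrative error handling around some scraping call.
async function safeScrape(
  doScrape: () => Promise<string>,
  url: string,
): Promise<string | null> {
  try {
    return await doScrape();
  } catch (err) {
    // Normalize anything thrown into a ReaderError.
    const readerErr = err instanceof ReaderError ? err : wrapError(err, url);

    if (readerErr instanceof TimeoutError) {
      console.warn(`Timed out after ${readerErr.timeoutMs}ms for ${url}`);
    }

    const canRetry =
      readerErr.retryable && readerErr.code !== ReaderErrorCode.ROBOTS_BLOCKED;
    if (canRetry) {
      console.info(`Retry is reasonable for ${url} (${readerErr.code})`);
    }

    // Structured, serializable error details for logging.
    console.error(JSON.stringify(readerErr.toJSON()));
    return null;
  }
}
```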