@vakra-dev/reader 0.0.2 → 0.0.3
- package/README.md +29 -26
- package/dist/cli/index.js +429 -733
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +205 -41
- package/dist/index.js +646 -714
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
@@ -139,10 +139,8 @@ interface BrowserPoolConfig {
 interface ScrapeOptions {
     /** Array of URLs to scrape */
     urls: string[];
-    /** Output formats (default: ['markdown']) */
-    formats?: Array<"markdown" | "html" | "json" | "text">;
-    /** Include URL, title, timestamp (default: true) */
-    includeMetadata?: boolean;
+    /** Output formats - which content fields to include (default: ['markdown']) */
+    formats?: Array<"markdown" | "html">;
     /** Custom user agent string */
     userAgent?: string;
     /** Request timeout in milliseconds (default: 30000) */
@@ -155,6 +153,12 @@ interface ScrapeOptions {
     removeAds?: boolean;
     /** Remove base64-encoded images to reduce output size (default: true) */
     removeBase64Images?: boolean;
+    /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+    onlyMainContent?: boolean;
+    /** CSS selectors for elements to include (if set, only these elements are kept) */
+    includeTags?: string[];
+    /** CSS selectors for elements to exclude (removed from output) */
+    excludeTags?: string[];
     /** Skip TLS/SSL certificate verification (default: true) */
     skipTLSVerification?: boolean;
     /** Number of URLs to process in parallel (default: 1 - sequential) */
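Taken together, the two `ScrapeOptions` hunks narrow the `formats` union, drop `includeMetadata`, and add three content-scoping fields. A minimal sketch of a 0.0.3 call; `scrape()` is exported but its exact signature is not part of this diff, so the call shape is an assumption:

```ts
import { scrape, type ScrapeOptions } from "@vakra-dev/reader";

// Hypothetical invocation; the options object matches the interface above.
const options: ScrapeOptions = {
  urls: ["https://example.com/blog/post"],
  formats: ["markdown", "html"],   // narrowed union; "json"/"text" removed in 0.0.3
  onlyMainContent: true,           // new: strip nav/header/footer/sidebar
  includeTags: ["article", "main"],   // new: keep only matching elements (example selectors)
  excludeTags: [".cookie-banner"],    // new: always drop matching elements (example selector)
};

const result = await scrape(options);
```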
@@ -247,17 +251,13 @@ interface Page {
     waitTimeMs?: number;
 }
 /**
- * Individual website scrape result
+ * Individual website scrape result
  */
 interface WebsiteScrapeResult {
-    /** Markdown output (present if 'markdown' in formats) */
+    /** Markdown content (present if 'markdown' in formats) */
     markdown?: string;
-    /** HTML output (present if 'html' in formats) */
+    /** HTML content (present if 'html' in formats) */
     html?: string;
-    /** JSON output (present if 'json' in formats) */
-    json?: string;
-    /** Plain text output (present if 'text' in formats) */
-    text?: string;
     /** Metadata about the scraping operation */
     metadata: {
         /** Base URL that was scraped */
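Because only the requested formats are populated and the `json`/`text` fields are gone, downstream code should treat both remaining fields as optional. An illustrative guard:

```ts
import type { WebsiteScrapeResult } from "@vakra-dev/reader";

// result.json and result.text no longer exist as of 0.0.3; only the
// formats that were requested are populated, so guard optional reads.
function pickContent(result: WebsiteScrapeResult): string {
  return result.markdown ?? result.html ?? "";
}
```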
@@ -317,7 +317,7 @@ declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForS
 /**
  * Format type guard
  */
-declare function isValidFormat(format: string): format is "markdown" | "html" | "json" | "text";
+declare function isValidFormat(format: string): format is "markdown" | "html";
 /**
  * Check if a URL should be crawled based on base domain
  */
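The type guard now narrows to the two remaining formats, so it can gate untrusted input before building options. A small sketch:

```ts
import { isValidFormat } from "@vakra-dev/reader";

const requested = process.argv[2] ?? "markdown";
if (isValidFormat(requested)) {
  // requested is narrowed to "markdown" | "html" here; "json"/"text" now fail the guard
  console.log(`Using format: ${requested}`);
}
```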
@@ -343,8 +343,8 @@ interface CrawlOptions {
     includePatterns?: string[];
     /** URL patterns to exclude (regex strings) - matching URLs are skipped */
     excludePatterns?: string[];
-    /** Output formats for scraped content (default: ['markdown']) */
-    formats?: Array<"markdown" | "html" | "json" | "text">;
+    /** Output formats for scraped content (default: ['markdown']) */
+    formats?: Array<"markdown" | "html">;
     /** Number of URLs to scrape in parallel (default: 2) */
     scrapeConcurrency?: number;
     /** Remove ads and tracking elements (default: true) */
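`CrawlOptions` gets the same `formats` narrowing. A hedged sketch; fields outside this hunk (such as the start URL) are not shown in the diff, hence the `Partial`:

```ts
import { crawl, type CrawlOptions } from "@vakra-dev/reader";

// Only the fields visible in the hunk above are used here; required fields
// declared elsewhere in CrawlOptions are elided, so check against Partial.
const opts = {
  includePatterns: ["^https://example\\.com/docs/"],
  excludePatterns: ["\\.pdf$"],
  formats: ["markdown"],   // "json" / "text" are no longer accepted in 0.0.3
  scrapeConcurrency: 2,
} satisfies Partial<CrawlOptions>;
```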
@@ -828,31 +828,31 @@ declare class DaemonClient {
 declare function isDaemonRunning(port?: number): Promise<boolean>;
 
 /**
- * Convert
+ * Convert HTML to Markdown
+ *
+ * Simple conversion without any headers, metadata, or formatting wrappers.
+ * Returns clean markdown content ready for LLM consumption.
  */
-declare function
-
+declare function htmlToMarkdown(html: string): string;
 /**
- *
+ * Alias for htmlToMarkdown (backward compatibility)
  */
-declare
+declare const formatToMarkdown: typeof htmlToMarkdown;
 
 /**
- *
-
-
-
- *
+ * HTML formatter
+ *
+ * Returns the cleaned HTML content as-is.
+ * The content has already been processed by content-cleaner.ts
+ * (ads removed, base64 images stripped, scripts/styles removed).
  */
-declare function formatToJsonLite(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
-
 /**
- *
+ * Return HTML content as-is (already cleaned by content-cleaner)
  *
- *
- *
+ * This is essentially a pass-through. The cleaning happens in scraper.ts
+ * via cleanContent() before this is called.
  */
-declare function
+declare function formatToHTML(html: string): string;
 
 /**
  * Extract comprehensive website metadata from HTML content
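`formatToJsonLite` is removed, and the markdown/HTML formatters are now plain string-to-string functions. A usage sketch based on the declarations above (the exact markdown output is illustrative):

```ts
import { htmlToMarkdown, formatToMarkdown, formatToHTML } from "@vakra-dev/reader";

const html = "<article><h1>Title</h1><p>Hello <strong>world</strong>.</p></article>";

const md = htmlToMarkdown(html);        // plain markdown, no metadata wrappers
const same = formatToMarkdown(html);    // alias, kept for backward compatibility
const passthrough = formatToHTML(html); // returns input as-is; cleaning happens upstream
```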
@@ -862,6 +862,13 @@ declare function extractMetadata(html: string, baseUrl: string): WebsiteMetadata
 
 /**
  * HTML content cleaning utilities using DOM parsing
+ *
+ * Layered extraction strategy:
+ * 1. Remove scripts, styles, hidden elements (always safe)
+ * 2. Remove overlays/modals (always safe)
+ * 3. Remove ads (if enabled)
+ * 4. Remove navigation with protection (check each element before removing)
+ * 5. Find and isolate main content
  */
 /**
  * Content cleaning options
@@ -871,9 +878,15 @@ interface CleaningOptions {
     removeAds?: boolean;
     /** Remove base64-encoded images (default: true) */
     removeBase64Images?: boolean;
+    /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+    onlyMainContent?: boolean;
+    /** CSS selectors for elements to include (if set, only these elements are kept) */
+    includeTags?: string[];
+    /** CSS selectors for elements to exclude (removed from output) */
+    excludeTags?: string[];
 }
 /**
- *
+ * Main export - clean HTML content
  */
 declare function cleanContent(html: string, baseUrl: string, options?: CleaningOptions): string;
 
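`cleanContent`'s full signature is declared above, so the new options can be exercised directly. A sketch with hypothetical input and selector values:

```ts
import { cleanContent } from "@vakra-dev/reader";

const rawHtml = "<html><body><nav>Menu</nav><main>Article text</main></body></html>";

const cleaned = cleanContent(rawHtml, "https://example.com", {
  removeAds: true,
  removeBase64Images: true,
  onlyMainContent: true,               // new in 0.0.3: drop nav/header/footer/sidebar
  excludeTags: [".newsletter-signup"], // new in 0.0.3: always remove these selectors (example)
});
```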
@@ -895,6 +908,14 @@ declare function isValidUrl(string: string): boolean;
 declare function isSameDomain(url: string, baseUrl: string): boolean;
 /**
  * Generate a URL key for deduplication
+ * Normalizes:
+ * - Removes fragments (hash)
+ * - Removes search params
+ * - Removes trailing slashes (except root)
+ * - Lowercases
+ * - Normalizes www vs non-www
+ * - Removes default ports (80 for http, 443 for https)
+ * - Normalizes index files (index.html, index.htm, default.html)
  */
 declare function getUrlKey(url: string): string;
 /**
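The expanded `getUrlKey` docs imply that superficially different URLs dedupe to one key. An illustrative check; the key's exact format is an implementation detail:

```ts
import { getUrlKey } from "@vakra-dev/reader";

// Per the documented rules (fragment, query, trailing slash, case, www,
// default port, and index-file normalization), these should collapse to one key,
// so a crawler visits the page only once.
const keys = [
  "https://WWW.Example.com:443/docs/",
  "https://example.com/docs#intro",
  "https://example.com/docs/index.html?utm_source=x",
].map(getUrlKey);

console.log(new Set(keys).size); // expected: 1
```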
@@ -1083,18 +1104,15 @@ interface ChallengeWaitOptions {
 /**
  * Detect if current page is a Cloudflare challenge
  *
- * Uses multi-signal approach
- *
+ * Uses multi-signal approach requiring BOTH:
+ * 1. Cloudflare infrastructure indicators (cdn-cgi, cf-ray, etc.)
+ * 2. Challenge-specific elements or text
+ *
+ * This prevents false positives on login pages or other sites
+ * that happen to use similar text.
  *
  * @param hero - Hero instance with loaded page
  * @returns Detection result with confidence score and signals
- *
- * @example
- * const detection = await detectChallenge(hero);
- * if (detection.isChallenge) {
- *     console.log(`Challenge detected: ${detection.type}`);
- *     console.log(`Signals: ${detection.signals.join(', ')}`);
- * }
  */
 declare function detectChallenge(hero: Hero): Promise<ChallengeDetection>;
 /**
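The `@example` removed here still illustrates the call shape. Restated as runnable code, assuming `Hero` comes from `@ulixee/hero` and the `ChallengeDetection` fields (`isChallenge`, `type`, `signals`) are unchanged from the 0.0.2 example:

```ts
import Hero from "@ulixee/hero"; // assumed source of the Hero type in the declaration
import { detectChallenge } from "@vakra-dev/reader";

// A Hero instance with a loaded page, as required by the @param above.
const hero = new Hero();
await hero.goto("https://example.com");

const detection = await detectChallenge(hero);
if (detection.isChallenge) {
  console.log(`Challenge detected: ${detection.type}`);
  console.log(`Signals: ${detection.signals.join(", ")}`);
}
```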
@@ -1213,4 +1231,150 @@ declare function createProxyUrl(config: ProxyConfig): string;
  */
 declare function parseProxyUrl(url: string): ProxyConfig;
 
-
+/**
+ * Typed error classes for Reader
+ *
+ * Provides actionable error messages and structured error information
+ * for better debugging and error handling.
+ */
+/**
+ * Error codes for categorization
+ */
+declare enum ReaderErrorCode {
+    NETWORK_ERROR = "NETWORK_ERROR",
+    TIMEOUT = "TIMEOUT",
+    CONNECTION_REFUSED = "CONNECTION_REFUSED",
+    CLOUDFLARE_CHALLENGE = "CLOUDFLARE_CHALLENGE",
+    BOT_DETECTED = "BOT_DETECTED",
+    ACCESS_DENIED = "ACCESS_DENIED",
+    CONTENT_EXTRACTION_FAILED = "CONTENT_EXTRACTION_FAILED",
+    EMPTY_CONTENT = "EMPTY_CONTENT",
+    INVALID_URL = "INVALID_URL",
+    INVALID_OPTIONS = "INVALID_OPTIONS",
+    ROBOTS_BLOCKED = "ROBOTS_BLOCKED",
+    BROWSER_ERROR = "BROWSER_ERROR",
+    POOL_EXHAUSTED = "POOL_EXHAUSTED",
+    CLIENT_CLOSED = "CLIENT_CLOSED",
+    NOT_INITIALIZED = "NOT_INITIALIZED",
+    UNKNOWN = "UNKNOWN"
+}
+/**
+ * Base error class for all Reader errors
+ */
+declare class ReaderError extends Error {
+    readonly code: ReaderErrorCode;
+    readonly url?: string;
+    readonly cause?: Error;
+    readonly timestamp: string;
+    readonly retryable: boolean;
+    constructor(message: string, code: ReaderErrorCode, options?: {
+        url?: string;
+        cause?: Error;
+        retryable?: boolean;
+    });
+    /**
+     * Convert to a plain object for serialization
+     */
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Network-related errors (connection issues, DNS failures, etc.)
+ */
+declare class NetworkError extends ReaderError {
+    constructor(message: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+}
+/**
+ * Timeout errors (page load, navigation, etc.)
+ */
+declare class TimeoutError extends ReaderError {
+    readonly timeoutMs: number;
+    constructor(message: string, timeoutMs: number, options?: {
+        url?: string;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Cloudflare challenge errors
+ */
+declare class CloudflareError extends ReaderError {
+    readonly challengeType: string;
+    constructor(challengeType: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Access denied errors (blocked, forbidden, etc.)
+ */
+declare class AccessDeniedError extends ReaderError {
+    readonly statusCode?: number;
+    constructor(message: string, options?: {
+        url?: string;
+        statusCode?: number;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Content extraction errors
+ */
+declare class ContentExtractionError extends ReaderError {
+    constructor(message: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+}
+/**
+ * Validation errors (invalid URLs, options, etc.)
+ */
+declare class ValidationError extends ReaderError {
+    readonly field?: string;
+    constructor(message: string, options?: {
+        field?: string;
+        url?: string;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * URL validation error
+ */
+declare class InvalidUrlError extends ReaderError {
+    constructor(url: string, reason?: string);
+}
+/**
+ * Robots.txt blocked error
+ */
+declare class RobotsBlockedError extends ReaderError {
+    constructor(url: string);
+}
+/**
+ * Browser pool errors
+ */
+declare class BrowserPoolError extends ReaderError {
+    constructor(message: string, options?: {
+        cause?: Error;
+    });
+}
+/**
+ * Client state errors
+ */
+declare class ClientClosedError extends ReaderError {
+    constructor();
+}
+/**
+ * Not initialized error
+ */
+declare class NotInitializedError extends ReaderError {
+    constructor(component: string);
+}
+/**
+ * Helper to wrap unknown errors in ReaderError
+ */
+declare function wrapError(error: unknown, url?: string): ReaderError;
+
+export { AccessDeniedError, type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, BrowserPoolError, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, ClientClosedError, CloudflareError, ContentExtractionError, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, InvalidUrlError, NetworkError, NotInitializedError, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, ReaderError, ReaderErrorCode, RobotsBlockedError, type ScrapeOptions, type ScrapeResult, Scraper, TimeoutError, ValidationError, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToMarkdown, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, htmlToMarkdown, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector, wrapError };
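The new error module is the bulk of the added surface. A sketch of how a consumer might branch on it, assuming `scrape()` throws these typed errors (the declarations guarantee only the shapes, not the throw sites):

```ts
import {
  scrape,
  ReaderError,
  ReaderErrorCode,
  TimeoutError,
  wrapError,
} from "@vakra-dev/reader";

try {
  await scrape({ urls: ["https://example.com"] });
} catch (err) {
  // wrapError normalizes unknown throwables into a ReaderError
  const e = err instanceof ReaderError ? err : wrapError(err);

  if (e instanceof TimeoutError) {
    console.error(`Timed out after ${e.timeoutMs}ms`, e.toJSON());
  } else if (e.retryable || e.code === ReaderErrorCode.NETWORK_ERROR) {
    // retry logic here; e.retryable is set per error type by the library
  } else {
    console.error(`[${e.code}] ${e.message} (url: ${e.url ?? "n/a"})`);
  }
}
```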