@vakra-dev/reader 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -90,6 +90,20 @@ interface IBrowserPool {
  healthCheck?(): Promise<HealthStatus>;
  }

+ /**
+ * Engine types for multi-engine scraping architecture
+ *
+ * Engine stack (in order of preference):
+ * 1. http - Native fetch, fastest, no browser
+ * 2. tlsclient - TLS fingerprinting via got-scraping
+ * 3. hero - Full browser with JavaScript execution
+ */
+
+ /**
+ * Available engine names
+ */
+ type EngineName = "http" | "tlsclient" | "hero";
+
  /**
  * Proxy configuration for Hero
  */
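The engine stack introduced here is a cascade: cheaper engines are tried first and the full Hero browser is the last resort. A minimal sketch of that idea, with a hypothetical tryEngine helper standing in for the package's internal orchestrator (not part of the published API):

    // The EngineName union mirrors the declaration above; it is inlined here
    // because the type does not appear in the package's export list.
    type EngineName = "http" | "tlsclient" | "hero";

    // Documented order of preference: cheapest engine first, full browser last.
    const ENGINE_ORDER: EngineName[] = ["http", "tlsclient", "hero"];

    // Hypothetical stand-in for the package's internal engine orchestrator.
    declare function tryEngine(engine: EngineName, url: string): Promise<string | null>;

    async function fetchWithCascade(url: string): Promise<string> {
      for (const engine of ENGINE_ORDER) {
        const html = await tryEngine(engine, url); // null means "fall through to the next engine"
        if (html !== null) return html;
      }
      throw new Error(`All engines failed for ${url}`);
    }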
@@ -139,12 +153,12 @@ interface BrowserPoolConfig {
  interface ScrapeOptions {
  /** Array of URLs to scrape */
  urls: string[];
- /** Output formats (default: ['markdown']) */
- formats?: Array<"markdown" | "html" | "json" | "text">;
- /** Include URL, title, timestamp (default: true) */
- includeMetadata?: boolean;
+ /** Output formats - which content fields to include (default: ['markdown']) */
+ formats?: Array<"markdown" | "html">;
  /** Custom user agent string */
  userAgent?: string;
+ /** Custom headers for requests */
+ headers?: Record<string, string>;
  /** Request timeout in milliseconds (default: 30000) */
  timeoutMs?: number;
  /** URL patterns to include (regex strings) */
@@ -155,6 +169,12 @@ interface ScrapeOptions {
  removeAds?: boolean;
  /** Remove base64-encoded images to reduce output size (default: true) */
  removeBase64Images?: boolean;
+ /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+ onlyMainContent?: boolean;
+ /** CSS selectors for elements to include (if set, only these elements are kept) */
+ includeTags?: string[];
+ /** CSS selectors for elements to exclude (removed from output) */
+ excludeTags?: string[];
  /** Skip TLS/SSL certificate verification (default: true) */
  skipTLSVerification?: boolean;
  /** Number of URLs to process in parallel (default: 1 - sequential) */
@@ -183,6 +203,12 @@ interface ScrapeOptions {
  browserPool?: BrowserPoolConfig;
  /** Browser pool instance (internal, provided by ReaderClient) */
  pool?: IBrowserPool;
+ /** Engines to use in order (default: ['http', 'tlsclient', 'hero']) */
+ engines?: EngineName[];
+ /** Skip specific engines (e.g., ['http'] to skip native fetch) */
+ skipEngines?: EngineName[];
+ /** Force a specific engine, skipping the cascade */
+ forceEngine?: EngineName;
  }
  /**
  * Website metadata extracted from the base page
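Taken together, the ScrapeOptions hunks above add per-request headers, content-scoping options, and engine selection. A brief configuration sketch using only fields visible in the 0.1.0 interface; the values are illustrative, and the assumption that scrape() takes the options object directly is not confirmed by this diff (only the export is shown):

    import { scrape, type ScrapeOptions } from "@vakra-dev/reader";

    const options: ScrapeOptions = {
      urls: ["https://example.com/docs"],
      formats: ["markdown", "html"],        // "json" and "text" were removed in 0.1.0
      headers: { "Accept-Language": "en" }, // new in 0.1.0
      onlyMainContent: true,                // strip nav/header/footer/sidebar
      excludeTags: [".cookie-banner"],      // example CSS selectors removed from output
      skipEngines: ["http"],                // start the cascade at tlsclient
      timeoutMs: 30_000,
    };

    // Assumption: scrape() accepts a ScrapeOptions object; its parameter list is not shown here.
    const result = await scrape(options);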
@@ -247,17 +273,13 @@ interface Page {
  waitTimeMs?: number;
  }
  /**
- * Individual website scrape result (for backward compatibility)
+ * Individual website scrape result
  */
  interface WebsiteScrapeResult {
- /** Markdown output (present if 'markdown' in formats) */
+ /** Markdown content (present if 'markdown' in formats) */
  markdown?: string;
- /** HTML output (present if 'html' in formats) */
+ /** HTML content (present if 'html' in formats) */
  html?: string;
- /** JSON output (present if 'json' in formats) */
- json?: string;
- /** Plain text output (present if 'text' in formats) */
- text?: string;
  /** Metadata about the scraping operation */
  metadata: {
  /** Base URL that was scraped */
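With json and text dropped from WebsiteScrapeResult, markdown and html are the only optional content fields, and only the ones requested via formats are populated. A small guard sketch for consumers migrating from 0.0.2:

    import type { WebsiteScrapeResult } from "@vakra-dev/reader";

    // Only the fields requested via `formats` are present, so check before use.
    function pickContent(result: WebsiteScrapeResult): string {
      if (result.markdown !== undefined) return result.markdown;
      if (result.html !== undefined) return result.html;
      throw new Error("No content field present; was `formats` empty?");
    }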
@@ -306,18 +328,22 @@ interface ScrapeResult {
  /**
  * Default scrape options
  */
- declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "browserPool" | "pool"> & {
+ declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "headers" | "browserPool" | "pool" | "engines" | "skipEngines" | "forceEngine"> & {
  proxy?: ProxyConfig;
  waitForSelector?: string;
  connectionToCore?: any;
  userAgent?: string;
+ headers?: Record<string, string>;
  browserPool?: BrowserPoolConfig;
  pool?: IBrowserPool;
+ engines?: EngineName[];
+ skipEngines?: EngineName[];
+ forceEngine?: EngineName;
  };
  /**
  * Format type guard
  */
- declare function isValidFormat(format: string): format is "markdown" | "html" | "json" | "text";
+ declare function isValidFormat(format: string): format is "markdown" | "html";
  /**
  * Check if a URL should be crawled based on base domain
  */
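The narrowed type guard is convenient for validating untrusted format lists (for example, CLI flags) before building ScrapeOptions; a short usage sketch:

    import { isValidFormat } from "@vakra-dev/reader";

    // Narrow arbitrary strings to the formats 0.1.0 still supports.
    const requested = ["markdown", "text", "html"];
    const formats = requested.filter(isValidFormat); // -> ["markdown", "html"]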
@@ -343,8 +369,8 @@ interface CrawlOptions {
  includePatterns?: string[];
  /** URL patterns to exclude (regex strings) - matching URLs are skipped */
  excludePatterns?: string[];
- /** Output formats for scraped content (default: ['markdown', 'html']) */
- formats?: Array<"markdown" | "html" | "json" | "text">;
+ /** Output formats for scraped content (default: ['markdown']) */
+ formats?: Array<"markdown" | "html">;
  /** Number of URLs to scrape in parallel (default: 2) */
  scrapeConcurrency?: number;
  /** Remove ads and tracking elements (default: true) */
@@ -547,7 +573,6 @@ declare class ReaderClient {
  */
  declare class Scraper {
  private options;
- private pool;
  private logger;
  private robotsCache;
  constructor(options: ScrapeOptions);
@@ -570,13 +595,7 @@ declare class Scraper {
  */
  private scrapeSingleUrlWithRetry;
  /**
- * Wait for the final page to load after any Cloudflare redirects
- * Cloudflare often does silent redirects even when bypassed, we need to ensure
- * we're on the actual content page before scraping.
- */
- private waitForFinalPage;
- /**
- * Scrape a single URL
+ * Scrape a single URL using the engine orchestrator
  */
  private scrapeSingleUrl;
  /**
@@ -828,31 +847,31 @@ declare class DaemonClient {
  declare function isDaemonRunning(port?: number): Promise<boolean>;

  /**
- * Convert pages to consolidated Markdown format
+ * Convert HTML to Markdown
+ *
+ * Simple conversion without any headers, metadata, or formatting wrappers.
+ * Returns clean markdown content ready for LLM consumption.
  */
- declare function formatToMarkdown(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
-
+ declare function htmlToMarkdown(html: string): string;
  /**
- * Convert pages to HTML format with metadata
+ * Alias for htmlToMarkdown (backward compatibility)
  */
- declare function formatToHTML(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
+ declare const formatToMarkdown: typeof htmlToMarkdown;

  /**
- * Convert pages to JSON format with metadata
- */
- declare function formatToJson(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
- /**
- * Convert pages to JSON format without HTML (lighter version)
+ * HTML formatter
+ *
+ * Returns the cleaned HTML content as-is.
+ * The content has already been processed by content-cleaner.ts
+ * (ads removed, base64 images stripped, scripts/styles removed).
  */
- declare function formatToJsonLite(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
-
  /**
- * Convert pages to plain text format
+ * Return HTML content as-is (already cleaned by content-cleaner)
  *
- * Strips all HTML tags and formatting, preserving only readable text content.
- * Useful for LLM consumption where markdown formatting is not needed.
+ * This is essentially a pass-through. The cleaning happens in scraper.ts
+ * via cleanContent() before this is called.
  */
- declare function formatToText(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
+ declare function formatToHTML(html: string): string;

  /**
  * Extract comprehensive website metadata from HTML content
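The formatter API shrinks from five page-list formatters to two single-argument functions. A short usage sketch (the HTML string is illustrative):

    import { htmlToMarkdown, formatToMarkdown, formatToHTML } from "@vakra-dev/reader";

    const html = "<main><h1>Release notes</h1><p>Hello <b>world</b></p></main>";

    const markdown = htmlToMarkdown(html);   // clean markdown, no metadata wrapper
    const legacy = formatToMarkdown(html);   // alias kept for backward compatibility
    const passthrough = formatToHTML(html);  // returns the (already cleaned) HTML as-is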
@@ -862,6 +881,13 @@ declare function extractMetadata(html: string, baseUrl: string): WebsiteMetadata

  /**
  * HTML content cleaning utilities using DOM parsing
+ *
+ * Layered extraction strategy:
+ * 1. Remove scripts, styles, hidden elements (always safe)
+ * 2. Remove overlays/modals (always safe)
+ * 3. Remove ads (if enabled)
+ * 4. Remove navigation with protection (check each element before removing)
+ * 5. Find and isolate main content
  */
  /**
  * Content cleaning options
@@ -871,9 +897,15 @@ interface CleaningOptions {
  removeAds?: boolean;
  /** Remove base64-encoded images (default: true) */
  removeBase64Images?: boolean;
+ /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+ onlyMainContent?: boolean;
+ /** CSS selectors for elements to include (if set, only these elements are kept) */
+ includeTags?: string[];
+ /** CSS selectors for elements to exclude (removed from output) */
+ excludeTags?: string[];
  }
  /**
- * Clean HTML content (alias for cleanHtml with options)
+ * Main export - clean HTML content
  */
  declare function cleanContent(html: string, baseUrl: string, options?: CleaningOptions): string;
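Together with the layered extraction strategy documented above, the cleaner can now be steered per call. A brief sketch using only options shown in this diff; the HTML and selectors are examples, not package defaults:

    import { cleanContent } from "@vakra-dev/reader";

    const rawHtml = "<html><body><nav>menu</nav><main><p>Article</p></main></body></html>";

    const cleaned = cleanContent(rawHtml, "https://example.com/article", {
      removeAds: true,
      removeBase64Images: true,
      onlyMainContent: true,          // drop nav/header/footer/sidebar
      excludeTags: ["aside", ".ad"],  // extra CSS selectors to strip
    });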
 
@@ -895,6 +927,14 @@ declare function isValidUrl(string: string): boolean;
  declare function isSameDomain(url: string, baseUrl: string): boolean;
  /**
  * Generate a URL key for deduplication
+ * Normalizes:
+ * - Removes fragments (hash)
+ * - Removes search params
+ * - Removes trailing slashes (except root)
+ * - Lowercases
+ * - Normalizes www vs non-www
+ * - Removes default ports (80 for http, 443 for https)
+ * - Normalizes index files (index.html, index.htm, default.html)
  */
  declare function getUrlKey(url: string): string;
  /**
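For deduplication, getUrlKey now documents how URL variants collapse to one key. A hedged sketch of what those rules amount to; this is an illustrative re-implementation, not the package's function, and it assumes "www." is stripped and index filenames are dropped (the diff does not state the direction of those normalizations):

    // Illustrative re-implementation of the documented rules, NOT the package's code.
    function urlKeySketch(raw: string): string {
      const u = new URL(raw.toLowerCase());          // lowercase
      u.hash = "";                                   // drop fragment
      u.search = "";                                 // drop search params
      u.hostname = u.hostname.replace(/^www\./, ""); // www vs non-www (assumed: strip)
      // WHATWG URL already omits default ports (80/443) when serializing.
      let path = u.pathname.replace(/\/(index\.html?|default\.html)$/, "/");
      if (path.length > 1) path = path.replace(/\/+$/, ""); // trailing slash, except root
      return `${u.protocol}//${u.hostname}${path}`;
    }

    // e.g. urlKeySketch("https://WWW.Example.com:443/docs/index.html#intro")
    //   -> "https://example.com/docs"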
@@ -1083,18 +1123,15 @@ interface ChallengeWaitOptions {
  /**
  * Detect if current page is a Cloudflare challenge
  *
- * Uses multi-signal approach with ONLY challenge-specific indicators.
- * No content length heuristics to avoid false positives.
+ * Uses multi-signal approach requiring BOTH:
+ * 1. Cloudflare infrastructure indicators (cdn-cgi, cf-ray, etc.)
+ * 2. Challenge-specific elements or text
+ *
+ * This prevents false positives on login pages or other sites
+ * that happen to use similar text.
  *
  * @param hero - Hero instance with loaded page
  * @returns Detection result with confidence score and signals
- *
- * @example
- * const detection = await detectChallenge(hero);
- * if (detection.isChallenge) {
- * console.log(`Challenge detected: ${detection.type}`);
- * console.log(`Signals: ${detection.signals.join(', ')}`);
- * }
  */
  declare function detectChallenge(hero: Hero): Promise<ChallengeDetection>;
  /**
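The new detection contract is effectively an AND of two signal groups. A hedged sketch of that rule over a raw HTML string, for illustration only: the real detectChallenge works on a Hero page and returns a ChallengeDetection with confidence and signals, and the challenge-text indicators below are assumptions, not the package's actual signal list.

    // Illustrative approximation of the two-signal rule described above.
    function looksLikeCloudflareChallenge(html: string): boolean {
      const hasInfrastructure = /cdn-cgi|cf-ray/i.test(html); // Cloudflare infrastructure indicators
      const hasChallengeText =
        /checking your browser|cf-challenge/i.test(html);     // assumed challenge-specific markers
      return hasInfrastructure && hasChallengeText;            // BOTH required
    }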
@@ -1213,4 +1250,150 @@ declare function createProxyUrl(config: ProxyConfig): string;
  */
  declare function parseProxyUrl(url: string): ProxyConfig;

- export { type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, type ScrapeOptions, type ScrapeResult, Scraper, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToJson, formatToJsonLite, formatToMarkdown, formatToText, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector };
+ /**
+ * Typed error classes for Reader
+ *
+ * Provides actionable error messages and structured error information
+ * for better debugging and error handling.
+ */
+ /**
+ * Error codes for categorization
+ */
+ declare enum ReaderErrorCode {
+ NETWORK_ERROR = "NETWORK_ERROR",
+ TIMEOUT = "TIMEOUT",
+ CONNECTION_REFUSED = "CONNECTION_REFUSED",
+ CLOUDFLARE_CHALLENGE = "CLOUDFLARE_CHALLENGE",
+ BOT_DETECTED = "BOT_DETECTED",
+ ACCESS_DENIED = "ACCESS_DENIED",
+ CONTENT_EXTRACTION_FAILED = "CONTENT_EXTRACTION_FAILED",
+ EMPTY_CONTENT = "EMPTY_CONTENT",
+ INVALID_URL = "INVALID_URL",
+ INVALID_OPTIONS = "INVALID_OPTIONS",
+ ROBOTS_BLOCKED = "ROBOTS_BLOCKED",
+ BROWSER_ERROR = "BROWSER_ERROR",
+ POOL_EXHAUSTED = "POOL_EXHAUSTED",
+ CLIENT_CLOSED = "CLIENT_CLOSED",
+ NOT_INITIALIZED = "NOT_INITIALIZED",
+ UNKNOWN = "UNKNOWN"
+ }
+ /**
+ * Base error class for all Reader errors
+ */
+ declare class ReaderError extends Error {
+ readonly code: ReaderErrorCode;
+ readonly url?: string;
+ readonly cause?: Error;
+ readonly timestamp: string;
+ readonly retryable: boolean;
+ constructor(message: string, code: ReaderErrorCode, options?: {
+ url?: string;
+ cause?: Error;
+ retryable?: boolean;
+ });
+ /**
+ * Convert to a plain object for serialization
+ */
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Network-related errors (connection issues, DNS failures, etc.)
+ */
+ declare class NetworkError extends ReaderError {
+ constructor(message: string, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ }
+ /**
+ * Timeout errors (page load, navigation, etc.)
+ */
+ declare class TimeoutError extends ReaderError {
+ readonly timeoutMs: number;
+ constructor(message: string, timeoutMs: number, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Cloudflare challenge errors
+ */
+ declare class CloudflareError extends ReaderError {
+ readonly challengeType: string;
+ constructor(challengeType: string, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Access denied errors (blocked, forbidden, etc.)
+ */
+ declare class AccessDeniedError extends ReaderError {
+ readonly statusCode?: number;
+ constructor(message: string, options?: {
+ url?: string;
+ statusCode?: number;
+ cause?: Error;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Content extraction errors
+ */
+ declare class ContentExtractionError extends ReaderError {
+ constructor(message: string, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ }
+ /**
+ * Validation errors (invalid URLs, options, etc.)
+ */
+ declare class ValidationError extends ReaderError {
+ readonly field?: string;
+ constructor(message: string, options?: {
+ field?: string;
+ url?: string;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * URL validation error
+ */
+ declare class InvalidUrlError extends ReaderError {
+ constructor(url: string, reason?: string);
+ }
+ /**
+ * Robots.txt blocked error
+ */
+ declare class RobotsBlockedError extends ReaderError {
+ constructor(url: string);
+ }
+ /**
+ * Browser pool errors
+ */
+ declare class BrowserPoolError extends ReaderError {
+ constructor(message: string, options?: {
+ cause?: Error;
+ });
+ }
+ /**
+ * Client state errors
+ */
+ declare class ClientClosedError extends ReaderError {
+ constructor();
+ }
+ /**
+ * Not initialized error
+ */
+ declare class NotInitializedError extends ReaderError {
+ constructor(component: string);
+ }
+ /**
+ * Helper to wrap unknown errors in ReaderError
+ */
+ declare function wrapError(error: unknown, url?: string): ReaderError;
+
+ export { AccessDeniedError, type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, BrowserPoolError, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, ClientClosedError, CloudflareError, ContentExtractionError, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, InvalidUrlError, NetworkError, NotInitializedError, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, ReaderError, ReaderErrorCode, RobotsBlockedError, type ScrapeOptions, type ScrapeResult, Scraper, TimeoutError, ValidationError, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToMarkdown, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, htmlToMarkdown, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector, wrapError };
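With the typed error hierarchy now exported, callers can branch on error class or code and decide whether to retry. A short sketch using only exports shown above; the retry policy is the caller's, and the scrape() argument shape is an assumption (the diff does not show its signature):

    import { ReaderError, ReaderErrorCode, TimeoutError, wrapError, scrape } from "@vakra-dev/reader";

    try {
      await scrape({ urls: ["https://example.com"] }); // assumed call shape
    } catch (err) {
      // Normalize anything unknown into a ReaderError.
      const readerErr = err instanceof ReaderError ? err : wrapError(err, "https://example.com");

      if (readerErr instanceof TimeoutError) {
        console.warn(`Timed out after ${readerErr.timeoutMs}ms`);
      } else if (readerErr.code === ReaderErrorCode.ROBOTS_BLOCKED) {
        console.warn("Blocked by robots.txt; skipping");
      }

      if (readerErr.retryable) {
        // Caller-side policy: requeue the URL for another attempt.
      }
      console.error(JSON.stringify(readerErr.toJSON())); // structured, serializable error info
    }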