@vakra-dev/reader 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -139,10 +139,8 @@ interface BrowserPoolConfig {
 interface ScrapeOptions {
     /** Array of URLs to scrape */
     urls: string[];
-    /** Output formats (default: ['markdown']) */
-    formats?: Array<"markdown" | "html" | "json" | "text">;
-    /** Include URL, title, timestamp (default: true) */
-    includeMetadata?: boolean;
+    /** Output formats - which content fields to include (default: ['markdown']) */
+    formats?: Array<"markdown" | "html">;
     /** Custom user agent string */
     userAgent?: string;
     /** Request timeout in milliseconds (default: 30000) */
@@ -155,6 +153,12 @@ interface ScrapeOptions {
     removeAds?: boolean;
     /** Remove base64-encoded images to reduce output size (default: true) */
     removeBase64Images?: boolean;
+    /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+    onlyMainContent?: boolean;
+    /** CSS selectors for elements to include (if set, only these elements are kept) */
+    includeTags?: string[];
+    /** CSS selectors for elements to exclude (removed from output) */
+    excludeTags?: string[];
     /** Skip TLS/SSL certificate verification (default: true) */
     skipTLSVerification?: boolean;
     /** Number of URLs to process in parallel (default: 1 - sequential) */
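Taken together, the two hunks above narrow `formats` to `"markdown" | "html"`, drop `includeMetadata`, and add selector-based content filtering. A minimal sketch of the new options, assuming the exported `scrape` function accepts a `ScrapeOptions` object (its call signature is not shown in this diff) and using hypothetical selectors:

```ts
// Sketch only: `scrape` is exported by the package, but its call signature is
// not part of this diff; the options shape follows the ScrapeOptions declared above.
import { scrape, type ScrapeOptions } from "@vakra-dev/reader";

const options: ScrapeOptions = {
  urls: ["https://example.com/docs"],
  // 0.0.3 narrows formats to "markdown" | "html"; "json" and "text" are gone
  formats: ["markdown", "html"],
  // new in 0.0.3: main-content extraction plus selector-based filtering
  onlyMainContent: true,
  includeTags: ["article", "main"],    // hypothetical selectors to keep
  excludeTags: [".newsletter-signup"], // hypothetical selector to drop
};

const result = await scrape(options);
```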
@@ -247,17 +251,13 @@ interface Page {
     waitTimeMs?: number;
 }
 /**
- * Individual website scrape result (for backward compatibility)
+ * Individual website scrape result
  */
 interface WebsiteScrapeResult {
-    /** Markdown output (present if 'markdown' in formats) */
+    /** Markdown content (present if 'markdown' in formats) */
     markdown?: string;
-    /** HTML output (present if 'html' in formats) */
+    /** HTML content (present if 'html' in formats) */
     html?: string;
-    /** JSON output (present if 'json' in formats) */
-    json?: string;
-    /** Plain text output (present if 'text' in formats) */
-    text?: string;
     /** Metadata about the scraping operation */
     metadata: {
         /** Base URL that was scraped */
@@ -317,7 +317,7 @@ declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForS
 /**
  * Format type guard
  */
-declare function isValidFormat(format: string): format is "markdown" | "html" | "json" | "text";
+declare function isValidFormat(format: string): format is "markdown" | "html";
 /**
  * Check if a URL should be crawled based on base domain
  */
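The narrowed type guard can be used to validate untrusted format strings before passing them on; a small sketch:

```ts
import { isValidFormat } from "@vakra-dev/reader";

const requested: string = "html"; // e.g. user- or config-supplied

if (isValidFormat(requested)) {
  // narrowed to "markdown" | "html" here; "json" and "text" no longer pass in 0.0.3
  console.log(`using format: ${requested}`);
} else {
  throw new Error(`unsupported format: ${requested}`);
}
```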
@@ -343,8 +343,8 @@ interface CrawlOptions {
     includePatterns?: string[];
     /** URL patterns to exclude (regex strings) - matching URLs are skipped */
     excludePatterns?: string[];
-    /** Output formats for scraped content (default: ['markdown', 'html']) */
-    formats?: Array<"markdown" | "html" | "json" | "text">;
+    /** Output formats for scraped content (default: ['markdown']) */
+    formats?: Array<"markdown" | "html">;
     /** Number of URLs to scrape in parallel (default: 2) */
     scrapeConcurrency?: number;
     /** Remove ads and tracking elements (default: true) */
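For crawling, the same narrowing applies and the default drops `'html'`. A sketch assuming `crawl` takes a start URL plus `CrawlOptions` (the exact signature is not part of this diff):

```ts
// Sketch only: assumes `crawl` takes a start URL plus CrawlOptions;
// only the options shape below comes from this diff.
import { crawl, type CrawlOptions } from "@vakra-dev/reader";

const options: CrawlOptions = {
  includePatterns: ["^https://example\\.com/docs/"], // regex strings
  excludePatterns: ["\\.pdf$"],
  formats: ["markdown"], // 0.0.3 default changed from ['markdown', 'html']
  scrapeConcurrency: 2,
};

const crawlResult = await crawl("https://example.com/docs", options);
```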
@@ -828,31 +828,31 @@ declare class DaemonClient {
 declare function isDaemonRunning(port?: number): Promise<boolean>;
 
 /**
- * Convert pages to consolidated Markdown format
+ * Convert HTML to Markdown
+ *
+ * Simple conversion without any headers, metadata, or formatting wrappers.
+ * Returns clean markdown content ready for LLM consumption.
  */
-declare function formatToMarkdown(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
-
+declare function htmlToMarkdown(html: string): string;
 /**
- * Convert pages to HTML format with metadata
+ * Alias for htmlToMarkdown (backward compatibility)
  */
-declare function formatToHTML(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
+declare const formatToMarkdown: typeof htmlToMarkdown;
 
 /**
- * Convert pages to JSON format with metadata
- */
-declare function formatToJson(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
-/**
- * Convert pages to JSON format without HTML (lighter version)
+ * HTML formatter
+ *
+ * Returns the cleaned HTML content as-is.
+ * The content has already been processed by content-cleaner.ts
+ * (ads removed, base64 images stripped, scripts/styles removed).
  */
-declare function formatToJsonLite(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
-
 /**
- * Convert pages to plain text format
+ * Return HTML content as-is (already cleaned by content-cleaner)
  *
- * Strips all HTML tags and formatting, preserving only readable text content.
- * Useful for LLM consumption where markdown formatting is not needed.
+ * This is essentially a pass-through. The cleaning happens in scraper.ts
+ * via cleanContent() before this is called.
  */
-declare function formatToText(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
+declare function formatToHTML(html: string): string;
 
 /**
  * Extract comprehensive website metadata from HTML content
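The formatter API changes from page-list consolidation to plain string-in/string-out helpers. A short sketch using the signatures declared above:

```ts
import { htmlToMarkdown, formatToMarkdown, formatToHTML } from "@vakra-dev/reader";

const html = "<article><h1>Hello</h1><p>World</p></article>";

// 0.0.3: plain HTML-to-Markdown conversion, no headers or metadata wrappers
const markdown = htmlToMarkdown(html);

// backward-compatible alias: same function reference, so identical output
const viaAlias = formatToMarkdown(html);

// pass-through for HTML that was already cleaned by cleanContent()
const passthrough = formatToHTML(html);

console.log(markdown === viaAlias); // true (same function)
```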
@@ -862,6 +862,13 @@ declare function extractMetadata(html: string, baseUrl: string): WebsiteMetadata
 
 /**
  * HTML content cleaning utilities using DOM parsing
+ *
+ * Layered extraction strategy:
+ * 1. Remove scripts, styles, hidden elements (always safe)
+ * 2. Remove overlays/modals (always safe)
+ * 3. Remove ads (if enabled)
+ * 4. Remove navigation with protection (check each element before removing)
+ * 5. Find and isolate main content
  */
 /**
  * Content cleaning options
@@ -871,9 +878,15 @@ interface CleaningOptions {
     removeAds?: boolean;
     /** Remove base64-encoded images (default: true) */
     removeBase64Images?: boolean;
+    /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+    onlyMainContent?: boolean;
+    /** CSS selectors for elements to include (if set, only these elements are kept) */
+    includeTags?: string[];
+    /** CSS selectors for elements to exclude (removed from output) */
+    excludeTags?: string[];
 }
 /**
- * Clean HTML content (alias for cleanHtml with options)
+ * Main export - clean HTML content
  */
 declare function cleanContent(html: string, baseUrl: string, options?: CleaningOptions): string;
 
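`cleanContent` now accepts the same extraction controls as `ScrapeOptions`. A sketch using the declared signature, with a hypothetical selector:

```ts
import { cleanContent, htmlToMarkdown } from "@vakra-dev/reader";

const rawHtml = "<html><body><nav>Menu</nav><main><p>Article text</p></main></body></html>";

const cleaned = cleanContent(rawHtml, "https://example.com", {
  removeAds: true,
  removeBase64Images: true,
  onlyMainContent: true,           // new in 0.0.3: drop nav/header/footer/sidebar
  excludeTags: [".cookie-banner"], // hypothetical selector
});

const markdown = htmlToMarkdown(cleaned);
```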
@@ -895,6 +908,14 @@ declare function isValidUrl(string: string): boolean;
 declare function isSameDomain(url: string, baseUrl: string): boolean;
 /**
  * Generate a URL key for deduplication
+ * Normalizes:
+ * - Removes fragments (hash)
+ * - Removes search params
+ * - Removes trailing slashes (except root)
+ * - Lowercases
+ * - Normalizes www vs non-www
+ * - Removes default ports (80 for http, 443 for https)
+ * - Normalizes index files (index.html, index.htm, default.html)
  */
 declare function getUrlKey(url: string): string;
 /**
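The documented normalization means several spellings of the same page should collapse to one deduplication key; a sketch (the exact key format is not specified, so only key equality is checked):

```ts
import { getUrlKey } from "@vakra-dev/reader";

// fragment, query, trailing slash, www, default port, and index.html
// should all normalize away per the rules documented above
const variants = [
  "https://www.example.com/docs/",
  "https://example.com/docs?utm_source=x#intro",
  "https://example.com:443/docs/index.html",
];

const keys = new Set(variants.map(getUrlKey));
console.log(keys.size); // expected: 1
```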
@@ -1083,18 +1104,15 @@ interface ChallengeWaitOptions {
 /**
  * Detect if current page is a Cloudflare challenge
  *
- * Uses multi-signal approach with ONLY challenge-specific indicators.
- * No content length heuristics to avoid false positives.
+ * Uses multi-signal approach requiring BOTH:
+ * 1. Cloudflare infrastructure indicators (cdn-cgi, cf-ray, etc.)
+ * 2. Challenge-specific elements or text
+ *
+ * This prevents false positives on login pages or other sites
+ * that happen to use similar text.
  *
  * @param hero - Hero instance with loaded page
  * @returns Detection result with confidence score and signals
- *
- * @example
- * const detection = await detectChallenge(hero);
- * if (detection.isChallenge) {
- *   console.log(`Challenge detected: ${detection.type}`);
- *   console.log(`Signals: ${detection.signals.join(', ')}`);
- * }
  */
 declare function detectChallenge(hero: Hero): Promise<ChallengeDetection>;
 /**
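Usage is unchanged apart from the stricter detection; the sketch below is adapted from the `@example` removed above and assumes the `Hero` instance comes from `@ulixee/hero`:

```ts
// Adapted from the removed @example; the detection fields (isChallenge, type,
// signals) follow the 0.0.1 docs and are assumed unchanged in 0.0.3.
import Hero from "@ulixee/hero"; // assumed source of the Hero type
import { detectChallenge } from "@vakra-dev/reader";

const hero = new Hero();
await hero.goto("https://example.com");

const detection = await detectChallenge(hero);
if (detection.isChallenge) {
  console.log(`Challenge detected: ${detection.type}`);
  console.log(`Signals: ${detection.signals.join(", ")}`);
}
await hero.close();
```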
@@ -1213,4 +1231,150 @@ declare function createProxyUrl(config: ProxyConfig): string;
  */
 declare function parseProxyUrl(url: string): ProxyConfig;
 
-export { type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, type ScrapeOptions, type ScrapeResult, Scraper, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToJson, formatToJsonLite, formatToMarkdown, formatToText, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector };
+/**
+ * Typed error classes for Reader
+ *
+ * Provides actionable error messages and structured error information
+ * for better debugging and error handling.
+ */
+/**
+ * Error codes for categorization
+ */
+declare enum ReaderErrorCode {
+    NETWORK_ERROR = "NETWORK_ERROR",
+    TIMEOUT = "TIMEOUT",
+    CONNECTION_REFUSED = "CONNECTION_REFUSED",
+    CLOUDFLARE_CHALLENGE = "CLOUDFLARE_CHALLENGE",
+    BOT_DETECTED = "BOT_DETECTED",
+    ACCESS_DENIED = "ACCESS_DENIED",
+    CONTENT_EXTRACTION_FAILED = "CONTENT_EXTRACTION_FAILED",
+    EMPTY_CONTENT = "EMPTY_CONTENT",
+    INVALID_URL = "INVALID_URL",
+    INVALID_OPTIONS = "INVALID_OPTIONS",
+    ROBOTS_BLOCKED = "ROBOTS_BLOCKED",
+    BROWSER_ERROR = "BROWSER_ERROR",
+    POOL_EXHAUSTED = "POOL_EXHAUSTED",
+    CLIENT_CLOSED = "CLIENT_CLOSED",
+    NOT_INITIALIZED = "NOT_INITIALIZED",
+    UNKNOWN = "UNKNOWN"
+}
+/**
+ * Base error class for all Reader errors
+ */
+declare class ReaderError extends Error {
+    readonly code: ReaderErrorCode;
+    readonly url?: string;
+    readonly cause?: Error;
+    readonly timestamp: string;
+    readonly retryable: boolean;
+    constructor(message: string, code: ReaderErrorCode, options?: {
+        url?: string;
+        cause?: Error;
+        retryable?: boolean;
+    });
+    /**
+     * Convert to a plain object for serialization
+     */
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Network-related errors (connection issues, DNS failures, etc.)
+ */
+declare class NetworkError extends ReaderError {
+    constructor(message: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+}
+/**
+ * Timeout errors (page load, navigation, etc.)
+ */
+declare class TimeoutError extends ReaderError {
+    readonly timeoutMs: number;
+    constructor(message: string, timeoutMs: number, options?: {
+        url?: string;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Cloudflare challenge errors
+ */
+declare class CloudflareError extends ReaderError {
+    readonly challengeType: string;
+    constructor(challengeType: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Access denied errors (blocked, forbidden, etc.)
+ */
+declare class AccessDeniedError extends ReaderError {
+    readonly statusCode?: number;
+    constructor(message: string, options?: {
+        url?: string;
+        statusCode?: number;
+        cause?: Error;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * Content extraction errors
+ */
+declare class ContentExtractionError extends ReaderError {
+    constructor(message: string, options?: {
+        url?: string;
+        cause?: Error;
+    });
+}
+/**
+ * Validation errors (invalid URLs, options, etc.)
+ */
+declare class ValidationError extends ReaderError {
+    readonly field?: string;
+    constructor(message: string, options?: {
+        field?: string;
+        url?: string;
+    });
+    toJSON(): Record<string, unknown>;
+}
+/**
+ * URL validation error
+ */
+declare class InvalidUrlError extends ReaderError {
+    constructor(url: string, reason?: string);
+}
+/**
+ * Robots.txt blocked error
+ */
+declare class RobotsBlockedError extends ReaderError {
+    constructor(url: string);
+}
+/**
+ * Browser pool errors
+ */
+declare class BrowserPoolError extends ReaderError {
+    constructor(message: string, options?: {
+        cause?: Error;
+    });
+}
+/**
+ * Client state errors
+ */
+declare class ClientClosedError extends ReaderError {
+    constructor();
+}
+/**
+ * Not initialized error
+ */
+declare class NotInitializedError extends ReaderError {
+    constructor(component: string);
+}
+/**
+ * Helper to wrap unknown errors in ReaderError
+ */
+declare function wrapError(error: unknown, url?: string): ReaderError;
+
+export { AccessDeniedError, type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, BrowserPoolError, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, ClientClosedError, CloudflareError, ContentExtractionError, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, InvalidUrlError, NetworkError, NotInitializedError, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, ReaderError, ReaderErrorCode, RobotsBlockedError, type ScrapeOptions, type ScrapeResult, Scraper, TimeoutError, ValidationError, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToMarkdown, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, htmlToMarkdown, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector, wrapError };
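0.0.3 also introduces the typed error hierarchy exported above. A sketch of how callers might branch on it, assuming `scrape` rejects with these errors (the diff declares the classes, not where they are thrown):

```ts
// Sketch only: the diff declares these error classes and wrapError, but not
// which operations throw them; scrape() rejecting with them is an assumption.
import {
  scrape,
  ReaderError,
  ReaderErrorCode,
  TimeoutError,
  wrapError,
} from "@vakra-dev/reader";

try {
  await scrape({ urls: ["https://example.com"] });
} catch (err) {
  const readerErr = err instanceof ReaderError ? err : wrapError(err, "https://example.com");

  if (readerErr instanceof TimeoutError) {
    console.error(`timed out after ${readerErr.timeoutMs}ms`);
  } else if (readerErr.code === ReaderErrorCode.CLOUDFLARE_CHALLENGE) {
    console.error("blocked by a Cloudflare challenge");
  }

  if (readerErr.retryable) {
    console.warn("retryable failure:", readerErr.toJSON()); // structured, log-friendly
  }
}
```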