@vakra-dev/reader 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -90,6 +90,20 @@ interface IBrowserPool {
  healthCheck?(): Promise<HealthStatus>;
  }

+ /**
+ * Engine types for multi-engine scraping architecture
+ *
+ * Engine stack (in order of preference):
+ * 1. http - Native fetch, fastest, no browser
+ * 2. tlsclient - TLS fingerprinting via got-scraping
+ * 3. hero - Full browser with JavaScript execution
+ */
+
+ /**
+ * Available engine names
+ */
+ type EngineName = "http" | "tlsclient" | "hero";
+
  /**
  * Proxy configuration for Hero
  */
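The engine stack introduced here is a cascade: cheaper engines are tried first and the full Hero browser is the last resort. A minimal sketch of that idea, with a hypothetical tryEngine helper standing in for the package's internal orchestrator (not part of the published API):

    // The EngineName union mirrors the declaration above; it is inlined here
    // because the type does not appear in the package's export list.
    type EngineName = "http" | "tlsclient" | "hero";

    // Documented order of preference: cheapest engine first, full browser last.
    const ENGINE_ORDER: EngineName[] = ["http", "tlsclient", "hero"];

    // Hypothetical stand-in for the package's internal engine orchestrator.
    declare function tryEngine(engine: EngineName, url: string): Promise<string | null>;

    async function fetchWithCascade(url: string): Promise<string> {
      for (const engine of ENGINE_ORDER) {
        const html = await tryEngine(engine, url); // null means "fall through to the next engine"
        if (html !== null) return html;
      }
      throw new Error(`All engines failed for ${url}`);
    }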
@@ -139,12 +153,12 @@ interface BrowserPoolConfig {
  interface ScrapeOptions {
  /** Array of URLs to scrape */
  urls: string[];
- /** Output formats (default: ['markdown']) */
- formats?: Array<"markdown" | "html" | "json" | "text">;
- /** Include URL, title, timestamp (default: true) */
- includeMetadata?: boolean;
+ /** Output formats - which content fields to include (default: ['markdown']) */
+ formats?: Array<"markdown" | "html">;
  /** Custom user agent string */
  userAgent?: string;
+ /** Custom headers for requests */
+ headers?: Record<string, string>;
  /** Request timeout in milliseconds (default: 30000) */
  timeoutMs?: number;
  /** URL patterns to include (regex strings) */
@@ -155,6 +169,12 @@ interface ScrapeOptions {
  removeAds?: boolean;
  /** Remove base64-encoded images to reduce output size (default: true) */
  removeBase64Images?: boolean;
+ /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+ onlyMainContent?: boolean;
+ /** CSS selectors for elements to include (if set, only these elements are kept) */
+ includeTags?: string[];
+ /** CSS selectors for elements to exclude (removed from output) */
+ excludeTags?: string[];
  /** Skip TLS/SSL certificate verification (default: true) */
  skipTLSVerification?: boolean;
  /** Number of URLs to process in parallel (default: 1 - sequential) */
@@ -183,6 +203,12 @@ interface ScrapeOptions {
  browserPool?: BrowserPoolConfig;
  /** Browser pool instance (internal, provided by ReaderClient) */
  pool?: IBrowserPool;
+ /** Engines to use in order (default: ['http', 'tlsclient', 'hero']) */
+ engines?: EngineName[];
+ /** Skip specific engines (e.g., ['http'] to skip native fetch) */
+ skipEngines?: EngineName[];
+ /** Force a specific engine, skipping the cascade */
+ forceEngine?: EngineName;
  }
  /**
  * Website metadata extracted from the base page
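Taken together, the ScrapeOptions hunks above add per-request headers, content-scoping options, and engine selection. A brief configuration sketch using only fields visible in the 0.1.0 interface; the values are illustrative, and the assumption that scrape() takes the options object directly is not confirmed by this diff (only the export is shown):

    import { scrape, type ScrapeOptions } from "@vakra-dev/reader";

    const options: ScrapeOptions = {
      urls: ["https://example.com/docs"],
      formats: ["markdown", "html"],        // "json" and "text" were removed in 0.1.0
      headers: { "Accept-Language": "en" }, // new in 0.1.0
      onlyMainContent: true,                // strip nav/header/footer/sidebar
      excludeTags: [".cookie-banner"],      // example CSS selectors removed from output
      skipEngines: ["http"],                // start the cascade at tlsclient
      timeoutMs: 30_000,
    };

    // Assumption: scrape() accepts a ScrapeOptions object; its parameter list is not shown here.
    const result = await scrape(options);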
@@ -247,17 +273,13 @@ interface Page {
  waitTimeMs?: number;
  }
  /**
- * Individual website scrape result (for backward compatibility)
+ * Individual website scrape result
  */
  interface WebsiteScrapeResult {
- /** Markdown output (present if 'markdown' in formats) */
+ /** Markdown content (present if 'markdown' in formats) */
  markdown?: string;
- /** HTML output (present if 'html' in formats) */
+ /** HTML content (present if 'html' in formats) */
  html?: string;
- /** JSON output (present if 'json' in formats) */
- json?: string;
- /** Plain text output (present if 'text' in formats) */
- text?: string;
  /** Metadata about the scraping operation */
  metadata: {
  /** Base URL that was scraped */
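With json and text dropped from WebsiteScrapeResult, markdown and html are the only optional content fields, and only the ones requested via formats are populated. A small guard sketch for consumers migrating from 0.0.2:

    import type { WebsiteScrapeResult } from "@vakra-dev/reader";

    // Only the fields requested via `formats` are present, so check before use.
    function pickContent(result: WebsiteScrapeResult): string {
      if (result.markdown !== undefined) return result.markdown;
      if (result.html !== undefined) return result.html;
      throw new Error("No content field present; was `formats` empty?");
    }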
@@ -306,18 +328,22 @@ interface ScrapeResult {
  /**
  * Default scrape options
  */
- declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "browserPool" | "pool"> & {
+ declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "headers" | "browserPool" | "pool" | "engines" | "skipEngines" | "forceEngine"> & {
  proxy?: ProxyConfig;
  waitForSelector?: string;
  connectionToCore?: any;
  userAgent?: string;
+ headers?: Record<string, string>;
  browserPool?: BrowserPoolConfig;
  pool?: IBrowserPool;
+ engines?: EngineName[];
+ skipEngines?: EngineName[];
+ forceEngine?: EngineName;
  };
  /**
  * Format type guard
  */
- declare function isValidFormat(format: string): format is "markdown" | "html" | "json" | "text";
+ declare function isValidFormat(format: string): format is "markdown" | "html";
  /**
  * Check if a URL should be crawled based on base domain
  */
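The narrowed type guard is convenient for validating untrusted format lists (for example, CLI flags) before building ScrapeOptions; a short usage sketch:

    import { isValidFormat } from "@vakra-dev/reader";

    // Narrow arbitrary strings to the formats 0.1.0 still supports.
    const requested = ["markdown", "text", "html"];
    const formats = requested.filter(isValidFormat); // -> ["markdown", "html"]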
@@ -343,8 +369,8 @@ interface CrawlOptions {
  includePatterns?: string[];
  /** URL patterns to exclude (regex strings) - matching URLs are skipped */
  excludePatterns?: string[];
- /** Output formats for scraped content (default: ['markdown', 'html']) */
- formats?: Array<"markdown" | "html" | "json" | "text">;
+ /** Output formats for scraped content (default: ['markdown']) */
+ formats?: Array<"markdown" | "html">;
  /** Number of URLs to scrape in parallel (default: 2) */
  scrapeConcurrency?: number;
  /** Remove ads and tracking elements (default: true) */
@@ -547,7 +573,6 @@ declare class ReaderClient {
  */
  declare class Scraper {
  private options;
- private pool;
  private logger;
  private robotsCache;
  constructor(options: ScrapeOptions);
@@ -570,13 +595,7 @@ declare class Scraper {
  */
  private scrapeSingleUrlWithRetry;
  /**
- * Wait for the final page to load after any Cloudflare redirects
- * Cloudflare often does silent redirects even when bypassed, we need to ensure
- * we're on the actual content page before scraping.
- */
- private waitForFinalPage;
- /**
- * Scrape a single URL
+ * Scrape a single URL using the engine orchestrator
  */
  private scrapeSingleUrl;
  /**
@@ -828,31 +847,31 @@ declare class DaemonClient {
  declare function isDaemonRunning(port?: number): Promise<boolean>;

  /**
- * Convert pages to consolidated Markdown format
+ * Convert HTML to Markdown
+ *
+ * Simple conversion without any headers, metadata, or formatting wrappers.
+ * Returns clean markdown content ready for LLM consumption.
  */
- declare function formatToMarkdown(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
-
+ declare function htmlToMarkdown(html: string): string;
  /**
- * Convert pages to HTML format with metadata
+ * Alias for htmlToMarkdown (backward compatibility)
  */
- declare function formatToHTML(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
+ declare const formatToMarkdown: typeof htmlToMarkdown;

  /**
- * Convert pages to JSON format with metadata
- */
- declare function formatToJson(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
- /**
- * Convert pages to JSON format without HTML (lighter version)
+ * HTML formatter
+ *
+ * Returns the cleaned HTML content as-is.
+ * The content has already been processed by content-cleaner.ts
+ * (ads removed, base64 images stripped, scripts/styles removed).
  */
- declare function formatToJsonLite(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
-
  /**
- * Convert pages to plain text format
+ * Return HTML content as-is (already cleaned by content-cleaner)
  *
- * Strips all HTML tags and formatting, preserving only readable text content.
- * Useful for LLM consumption where markdown formatting is not needed.
+ * This is essentially a pass-through. The cleaning happens in scraper.ts
+ * via cleanContent() before this is called.
  */
- declare function formatToText(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
+ declare function formatToHTML(html: string): string;

  /**
  * Extract comprehensive website metadata from HTML content
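The formatter API shrinks from five page-list formatters to two single-argument functions. A short usage sketch (the HTML string is illustrative):

    import { htmlToMarkdown, formatToMarkdown, formatToHTML } from "@vakra-dev/reader";

    const html = "<main><h1>Release notes</h1><p>Hello <b>world</b></p></main>";

    const markdown = htmlToMarkdown(html);   // clean markdown, no metadata wrapper
    const legacy = formatToMarkdown(html);   // alias kept for backward compatibility
    const passthrough = formatToHTML(html);  // returns the (already cleaned) HTML as-is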
@@ -862,6 +881,13 @@ declare function extractMetadata(html: string, baseUrl: string): WebsiteMetadata

  /**
  * HTML content cleaning utilities using DOM parsing
+ *
+ * Layered extraction strategy:
+ * 1. Remove scripts, styles, hidden elements (always safe)
+ * 2. Remove overlays/modals (always safe)
+ * 3. Remove ads (if enabled)
+ * 4. Remove navigation with protection (check each element before removing)
+ * 5. Find and isolate main content
  */
  /**
  * Content cleaning options
@@ -871,9 +897,15 @@ interface CleaningOptions {
  removeAds?: boolean;
  /** Remove base64-encoded images (default: true) */
  removeBase64Images?: boolean;
+ /** Extract only main content, removing nav/header/footer/sidebar (default: true) */
+ onlyMainContent?: boolean;
+ /** CSS selectors for elements to include (if set, only these elements are kept) */
+ includeTags?: string[];
+ /** CSS selectors for elements to exclude (removed from output) */
+ excludeTags?: string[];
  }
  /**
- * Clean HTML content (alias for cleanHtml with options)
+ * Main export - clean HTML content
  */
  declare function cleanContent(html: string, baseUrl: string, options?: CleaningOptions): string;
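Together with the layered extraction strategy documented above, the cleaner can now be steered per call. A brief sketch using only options shown in this diff; the HTML and selectors are examples, not package defaults:

    import { cleanContent } from "@vakra-dev/reader";

    const rawHtml = "<html><body><nav>menu</nav><main><p>Article</p></main></body></html>";

    const cleaned = cleanContent(rawHtml, "https://example.com/article", {
      removeAds: true,
      removeBase64Images: true,
      onlyMainContent: true,          // drop nav/header/footer/sidebar
      excludeTags: ["aside", ".ad"],  // extra CSS selectors to strip
    });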
 
@@ -895,6 +927,14 @@ declare function isValidUrl(string: string): boolean;
  declare function isSameDomain(url: string, baseUrl: string): boolean;
  /**
  * Generate a URL key for deduplication
+ * Normalizes:
+ * - Removes fragments (hash)
+ * - Removes search params
+ * - Removes trailing slashes (except root)
+ * - Lowercases
+ * - Normalizes www vs non-www
+ * - Removes default ports (80 for http, 443 for https)
+ * - Normalizes index files (index.html, index.htm, default.html)
  */
  declare function getUrlKey(url: string): string;
  /**
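For deduplication, getUrlKey now documents how URL variants collapse to one key. A hedged sketch of what those rules amount to; this is an illustrative re-implementation, not the package's function, and it assumes "www." is stripped and index filenames are dropped (the diff does not state the direction of those normalizations):

    // Illustrative re-implementation of the documented rules, NOT the package's code.
    function urlKeySketch(raw: string): string {
      const u = new URL(raw.toLowerCase());          // lowercase
      u.hash = "";                                   // drop fragment
      u.search = "";                                 // drop search params
      u.hostname = u.hostname.replace(/^www\./, ""); // www vs non-www (assumed: strip)
      // WHATWG URL already omits default ports (80/443) when serializing.
      let path = u.pathname.replace(/\/(index\.html?|default\.html)$/, "/");
      if (path.length > 1) path = path.replace(/\/+$/, ""); // trailing slash, except root
      return `${u.protocol}//${u.hostname}${path}`;
    }

    // e.g. urlKeySketch("https://WWW.Example.com:443/docs/index.html#intro")
    //   -> "https://example.com/docs"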
@@ -1083,18 +1123,15 @@ interface ChallengeWaitOptions {
  /**
  * Detect if current page is a Cloudflare challenge
  *
- * Uses multi-signal approach with ONLY challenge-specific indicators.
- * No content length heuristics to avoid false positives.
+ * Uses multi-signal approach requiring BOTH:
+ * 1. Cloudflare infrastructure indicators (cdn-cgi, cf-ray, etc.)
+ * 2. Challenge-specific elements or text
+ *
+ * This prevents false positives on login pages or other sites
+ * that happen to use similar text.
  *
  * @param hero - Hero instance with loaded page
  * @returns Detection result with confidence score and signals
- *
- * @example
- * const detection = await detectChallenge(hero);
- * if (detection.isChallenge) {
- * console.log(`Challenge detected: ${detection.type}`);
- * console.log(`Signals: ${detection.signals.join(', ')}`);
- * }
  */
  declare function detectChallenge(hero: Hero): Promise<ChallengeDetection>;
  /**
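The new detection contract is effectively an AND of two signal groups. A hedged sketch of that rule over a raw HTML string, for illustration only: the real detectChallenge works on a Hero page and returns a ChallengeDetection with confidence and signals, and the challenge-text indicators below are assumptions, not the package's actual signal list.

    // Illustrative approximation of the two-signal rule described above.
    function looksLikeCloudflareChallenge(html: string): boolean {
      const hasInfrastructure = /cdn-cgi|cf-ray/i.test(html); // Cloudflare infrastructure indicators
      const hasChallengeText =
        /checking your browser|cf-challenge/i.test(html);     // assumed challenge-specific markers
      return hasInfrastructure && hasChallengeText;            // BOTH required
    }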
@@ -1213,4 +1250,150 @@ declare function createProxyUrl(config: ProxyConfig): string;
  */
  declare function parseProxyUrl(url: string): ProxyConfig;

- export { type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, type ScrapeOptions, type ScrapeResult, Scraper, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToJson, formatToJsonLite, formatToMarkdown, formatToText, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector };
+ /**
+ * Typed error classes for Reader
+ *
+ * Provides actionable error messages and structured error information
+ * for better debugging and error handling.
+ */
+ /**
+ * Error codes for categorization
+ */
+ declare enum ReaderErrorCode {
+ NETWORK_ERROR = "NETWORK_ERROR",
+ TIMEOUT = "TIMEOUT",
+ CONNECTION_REFUSED = "CONNECTION_REFUSED",
+ CLOUDFLARE_CHALLENGE = "CLOUDFLARE_CHALLENGE",
+ BOT_DETECTED = "BOT_DETECTED",
+ ACCESS_DENIED = "ACCESS_DENIED",
+ CONTENT_EXTRACTION_FAILED = "CONTENT_EXTRACTION_FAILED",
+ EMPTY_CONTENT = "EMPTY_CONTENT",
+ INVALID_URL = "INVALID_URL",
+ INVALID_OPTIONS = "INVALID_OPTIONS",
+ ROBOTS_BLOCKED = "ROBOTS_BLOCKED",
+ BROWSER_ERROR = "BROWSER_ERROR",
+ POOL_EXHAUSTED = "POOL_EXHAUSTED",
+ CLIENT_CLOSED = "CLIENT_CLOSED",
+ NOT_INITIALIZED = "NOT_INITIALIZED",
+ UNKNOWN = "UNKNOWN"
+ }
+ /**
+ * Base error class for all Reader errors
+ */
+ declare class ReaderError extends Error {
+ readonly code: ReaderErrorCode;
+ readonly url?: string;
+ readonly cause?: Error;
+ readonly timestamp: string;
+ readonly retryable: boolean;
+ constructor(message: string, code: ReaderErrorCode, options?: {
+ url?: string;
+ cause?: Error;
+ retryable?: boolean;
+ });
+ /**
+ * Convert to a plain object for serialization
+ */
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Network-related errors (connection issues, DNS failures, etc.)
+ */
+ declare class NetworkError extends ReaderError {
+ constructor(message: string, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ }
+ /**
+ * Timeout errors (page load, navigation, etc.)
+ */
+ declare class TimeoutError extends ReaderError {
+ readonly timeoutMs: number;
+ constructor(message: string, timeoutMs: number, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Cloudflare challenge errors
+ */
+ declare class CloudflareError extends ReaderError {
+ readonly challengeType: string;
+ constructor(challengeType: string, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Access denied errors (blocked, forbidden, etc.)
+ */
+ declare class AccessDeniedError extends ReaderError {
+ readonly statusCode?: number;
+ constructor(message: string, options?: {
+ url?: string;
+ statusCode?: number;
+ cause?: Error;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * Content extraction errors
+ */
+ declare class ContentExtractionError extends ReaderError {
+ constructor(message: string, options?: {
+ url?: string;
+ cause?: Error;
+ });
+ }
+ /**
+ * Validation errors (invalid URLs, options, etc.)
+ */
+ declare class ValidationError extends ReaderError {
+ readonly field?: string;
+ constructor(message: string, options?: {
+ field?: string;
+ url?: string;
+ });
+ toJSON(): Record<string, unknown>;
+ }
+ /**
+ * URL validation error
+ */
+ declare class InvalidUrlError extends ReaderError {
+ constructor(url: string, reason?: string);
+ }
+ /**
+ * Robots.txt blocked error
+ */
+ declare class RobotsBlockedError extends ReaderError {
+ constructor(url: string);
+ }
+ /**
+ * Browser pool errors
+ */
+ declare class BrowserPoolError extends ReaderError {
+ constructor(message: string, options?: {
+ cause?: Error;
+ });
+ }
+ /**
+ * Client state errors
+ */
+ declare class ClientClosedError extends ReaderError {
+ constructor();
+ }
+ /**
+ * Not initialized error
+ */
+ declare class NotInitializedError extends ReaderError {
+ constructor(component: string);
+ }
+ /**
+ * Helper to wrap unknown errors in ReaderError
+ */
+ declare function wrapError(error: unknown, url?: string): ReaderError;
+
+ export { AccessDeniedError, type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, BrowserPoolError, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, ClientClosedError, CloudflareError, ContentExtractionError, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, InvalidUrlError, NetworkError, NotInitializedError, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, ReaderError, ReaderErrorCode, RobotsBlockedError, type ScrapeOptions, type ScrapeResult, Scraper, TimeoutError, ValidationError, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToMarkdown, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, htmlToMarkdown, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector, wrapError };
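With the typed error hierarchy now exported, callers can branch on error class or code and decide whether to retry. A short sketch using only exports shown above; the retry policy is the caller's, and the scrape() argument shape is an assumption (the diff does not show its signature):

    import { ReaderError, ReaderErrorCode, TimeoutError, wrapError, scrape } from "@vakra-dev/reader";

    try {
      await scrape({ urls: ["https://example.com"] }); // assumed call shape
    } catch (err) {
      // Normalize anything unknown into a ReaderError.
      const readerErr = err instanceof ReaderError ? err : wrapError(err, "https://example.com");

      if (readerErr instanceof TimeoutError) {
        console.warn(`Timed out after ${readerErr.timeoutMs}ms`);
      } else if (readerErr.code === ReaderErrorCode.ROBOTS_BLOCKED) {
        console.warn("Blocked by robots.txt; skipping");
      }

      if (readerErr.retryable) {
        // Caller-side policy: requeue the URL for another attempt.
      }
      console.error(JSON.stringify(readerErr.toJSON())); // structured, serializable error info
    }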