npm - @doclo/core - Versions diffs - 0.1.5 - Mend

@doclo/core 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/LICENSE +21 -0
package/README.md +34 -0
package/dist/index.d.ts +931 -0
package/dist/index.js +2293 -0
package/dist/index.js.map +1 -0
package/dist/internal/validation-utils.d.ts +1 -0
package/dist/internal/validation-utils.js +650 -0
package/dist/internal/validation-utils.js.map +1 -0
package/dist/observability/index.d.ts +933 -0
package/dist/observability/index.js +630 -0
package/dist/observability/index.js.map +1 -0
package/dist/pdf-utils.d.ts +123 -0
package/dist/pdf-utils.js +106 -0
package/dist/pdf-utils.js.map +1 -0
package/dist/runtime/base64.d.ts +100 -0
package/dist/runtime/base64.js +52 -0
package/dist/runtime/base64.js.map +1 -0
package/dist/runtime/crypto.d.ts +56 -0
package/dist/runtime/crypto.js +35 -0
package/dist/runtime/crypto.js.map +1 -0
package/dist/runtime/env.d.ts +130 -0
package/dist/runtime/env.js +76 -0
package/dist/runtime/env.js.map +1 -0
package/dist/security/index.d.ts +236 -0
package/dist/security/index.js +260 -0
package/dist/security/index.js.map +1 -0
package/dist/validation-CzOz6fwq.d.ts +1126 -0
package/dist/validation.d.ts +1 -0
package/dist/validation.js +445 -0
package/dist/validation.js.map +1 -0
package/package.json +70 -0

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,931 @@
+import { P as ProviderVendor, A as AccessMethod } from './validation-CzOz6fwq.js';
+export { z as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, s as ChunkMetadata, u as ChunkNodeConfig, t as ChunkOutput, n as CitationConfig, k as CitationSourceType, v as CombineNodeConfig, T as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, x as EnhancedExtractionSchema, E as ExtractNodeConfig, $ as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, G as FlowContext, a6 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a7 as FlowValidationError, I as IRLine, a as IRPage, W as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, Z as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, a8 as NODE_COMPATIBILITY_MATRIX, H as NodeCtx, K as NodeDef, J as NodeTypeInfo, Q as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a0 as OCRProviderOptions, w as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Y as PageRangeOptions, p as ParseNodeConfig, X as ProcessingMode, a2 as ProviderCitation, aj as ProviderIdentity, ah as RESERVED_VARIABLES, R as ReasoningConfig, _ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, y as StepMetric, V as VLMProvider, a1 as VLMProviderOptions, U as ValidationResult, a3 as aggregateMetrics, af as canStartForEachItemFlow, an as createIdentity, ab as getCompatibleTargets, aa as getNodeTypeInfo, a9 as getNodeTypeName, ac as getSuggestedConnections, ae as getValidForEachStarters, am as isLocalEndpoint, a4 as node, al as parseProviderString, ai as protectReservedVariables, a5 as runPipeline, ak as toProviderString, ag as validateJson, ad as validateNodeConnection } from './validation-CzOz6fwq.js';
+export { getDocumentPageCount, getPDFPageCount, getPageCountMetadata, getTotalPageCount, splitPDFIntoChunks } from './pdf-utils.js';
+/**
+ * File utilities for universal runtime (Edge Runtime + Node.js compatible)
+ *
+ * These utilities work in both Edge Runtime and Node.js environments.
+ * File system operations have been removed for Edge Runtime compatibility.
+ */
+/**
+ * Supported document MIME types that can be detected.
+ * This includes all formats supported by at least one provider:
+ * - Datalab: PDF, images, Office, OpenDocument, HTML, EPUB
+ * - Reducto: PDF, images (incl. HEIC, BMP, PSD), Office, RTF, TXT, CSV
+ * - Unsiloed: PDF, images, Office (DOCX, XLSX, PPTX)
+ *
+ * The union also contains the member 'unknown'; `AcceptedMimeType` is this
+ * same union with 'unknown' excluded.
+ */
+type DocumentMimeType = 'application/pdf' | 'image/jpeg' | 'image/png' | 'image/gif' | 'image/webp' | 'image/tiff' | 'image/bmp' | 'image/heic' | 'image/heif' | 'image/vnd.adobe.photoshop' | 'application/msword' | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' | 'application/vnd.ms-excel' | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' | 'application/vnd.ms-powerpoint' | 'application/vnd.openxmlformats-officedocument.presentationml.presentation' | 'application/vnd.oasis.opendocument.text' | 'application/vnd.oasis.opendocument.spreadsheet' | 'application/vnd.oasis.opendocument.presentation' | 'text/plain' | 'text/csv' | 'text/html' | 'application/rtf' | 'application/epub+zip' | 'unknown';
+/**
+ * Security limits configuration for file operations.
+ * Accepted as the optional `limits` argument of resolveDocument().
+ * Both fields are optional.
+ * @internal
+ */
+interface FileLimitsConfig {
+    /** Maximum file size in bytes (default: 100MB) - ⚠️ WARNING: Increasing this increases exposure to resource exhaustion attacks */
+    maxFileSize?: number;
+    /** Request timeout in milliseconds (default: 30s) - ⚠️ WARNING: Decreasing this may cause legitimate requests to fail */
+    requestTimeout?: number;
+}
+/** Detect the document MIME type of `input` (which may be `undefined`). The result is a DocumentMimeType, a union that includes 'unknown' (NOTE(review): presumably returned when nothing can be detected; confirm against the implementation). */ declare function detectDocumentType(input: string | undefined): DocumentMimeType;
+/**
+ * Check whether the input represents a PDF document
+ *
+ * Handles various input formats:
+ * - Data URLs with MIME type
+ * - File paths with .pdf extension
+ * - HTTP/HTTPS URLs (with or without query parameters)
+ * - Raw base64 strings (detected via magic bytes)
+ *
+ * @param input - Document input (data URL, file path, URL, or raw base64); the signature also accepts `undefined`
+ * @returns true if input appears to be a PDF
+ *
+ * @example
+ * ```typescript
+ * isPDFDocument('data:application/pdf;base64,...')  // true
+ * isPDFDocument('./document.pdf')                   // true
+ * isPDFDocument('https://example.com/doc.pdf?token=123')  // true
+ * isPDFDocument('JVBERi0xLjQK...')                  // true (raw base64 PDF; 'JVBERi0xLjQK' is the base64 of the '%PDF-1.4' header line)
+ * isPDFDocument('data:image/jpeg;base64,...')       // false
+ * ```
+ */
+declare function isPDFDocument(input: string | undefined): boolean;
+/**
+ * Resolve a document from a URL or data URI to a base64 data URL
+ *
+ * Supports two input types:
+ * - HTTP/HTTPS URLs: 'https://example.com/document.pdf'
+ * - Data URIs: 'data:application/pdf;base64,JVBERi0x...'
+ *
+ * Note: File paths are NOT supported in Edge Runtime.
+ * Use HTTP URLs or data URIs here; for an ArrayBuffer/Uint8Array, convert it
+ * with bufferToDataUri() instead of calling this function.
+ *
+ * @param input - Document source (URL or data URI)
+ * @param limits - Optional security limits for file size and request timeout (uses secure defaults if not specified)
+ * @returns Promise resolving to base64 data URL
+ *
+ * @example
+ * ```typescript
+ * // Remote URL
+ * const dataUrl = await resolveDocument('https://example.com/doc.pdf');
+ *
+ * // Remote URL with custom timeout (requestTimeout is in milliseconds)
+ * const dataUrl = await resolveDocument('https://example.com/doc.pdf', { requestTimeout: 60000 });
+ *
+ * // Data URI (pass-through)
+ * const dataUrl = await resolveDocument('data:application/pdf;base64,JVBERi0x...');
+ *
+ * // For ArrayBuffer, use bufferToDataUri() instead
+ * const dataUrl = bufferToDataUri(arrayBuffer, 'application/pdf');
+ * ```
+ */
+declare function resolveDocument(input: string, limits?: FileLimitsConfig): Promise<string>;
+/**
+ * Convert ArrayBuffer or Uint8Array to base64 data URI
+ *
+ * Edge Runtime compatible - no file system access required.
+ * Synchronous: the data URI string is returned directly, not as a Promise.
+ *
+ * @param buffer - File buffer (ArrayBuffer or Uint8Array)
+ * @param mimeType - MIME type (e.g., 'application/pdf', 'image/jpeg')
+ * @returns Base64 data URI string
+ *
+ * @example
+ * ```typescript
+ * // From ArrayBuffer
+ * const buffer = await response.arrayBuffer();
+ * const dataUri = bufferToDataUri(buffer, 'application/pdf');
+ *
+ * // From Uint8Array (these bytes are the ASCII codes for "Hello")
+ * const bytes = new Uint8Array([72, 101, 108, 108, 111]);
+ * const dataUri = bufferToDataUri(bytes, 'text/plain');
+ * ```
+ */
+declare function bufferToDataUri(buffer: ArrayBuffer | Uint8Array, mimeType: string): string;
+/**
+ * Legacy helper declared with the same parameter and return types as bufferToDataUri().
+ *
+ * @deprecated Use bufferToDataUri() instead. This function will be removed in v0.2.0.
+ */
+declare function bufferToBase64(buffer: ArrayBuffer | Uint8Array, mimeType: string): string;
+/**
+ * Accepted MIME types for flow input validation.
+ * Excludes 'unknown' - only known provider-supported formats.
+ *
+ * Used as the element type of `acceptedFormats` and as the return type of
+ * validateFlowInputFormat().
+ */
+type AcceptedMimeType = Exclude<DocumentMimeType, 'unknown'>;
+/**
+ * Error thrown when flow input doesn't match accepted formats
+ * (validateFlowInputFormat() documents this class in its `@throws`).
+ */
+declare class FlowInputValidationError extends Error {
+    /** The actual MIME type detected from the input */ readonly detectedType: string;
+    /** List of MIME types that would have been accepted */ readonly acceptedTypes: string[];
+    /**
+     * @param message - Human-readable error message
+     * @param detectedType - The actual MIME type detected from the input
+     * @param acceptedTypes - List of MIME types that would have been accepted
+     */
+    constructor(message: string, detectedType: string, acceptedTypes: string[]);
+}
+/**
+ * Validate flow input against accepted MIME type formats
+ *
+ * @param input - Flow input string (base64, data URL, or URL); the signature also accepts `undefined`
+ * @param acceptedFormats - List of accepted MIME types ('unknown' is not a valid entry, see AcceptedMimeType)
+ * @returns The detected MIME type if valid (typed as AcceptedMimeType, so never 'unknown')
+ * @throws FlowInputValidationError if format doesn't match accepted types
+ *
+ * @example
+ * ```typescript
+ * // Validate PDF only
+ * const mimeType = validateFlowInputFormat(pdfBase64, ['application/pdf']);
+ *
+ * // Validate images only
+ * const mimeType = validateFlowInputFormat(jpgBase64, ['image/jpeg', 'image/png']);
+ *
+ * // Will throw FlowInputValidationError if input is a PDF but only images accepted
+ * ```
+ */
+declare function validateFlowInputFormat(input: string | undefined, acceptedFormats: AcceptedMimeType[]): AcceptedMimeType;
+/**
+ * Provider Configuration
+ *
+ * Serializable provider configurations for doclo-sdk.
+ * These configs can be stored in databases and reconstructed at runtime.
+ */
+/**
+ * Base provider configuration
+ *
+ * Intersected into both VLMProviderConfig and OCRProviderConfig, so every
+ * provider config carries these fields.
+ */
+type BaseProviderConfig = {
+    /** Config identifier; the examples in this file use the same string as the key in ProviderSecrets and in the resulting provider registry */ id: string;
+    /** Optional name (NOTE(review): presumably a human-readable label; confirm) */ name?: string;
+};
+/**
+ * VLM (Vision Language Model) provider configuration
+ *
+ * BaseProviderConfig plus the VLM-specific fields; distinguished from
+ * OCRProviderConfig by `type: 'vlm'`.
+ */
+type VLMProviderConfig = BaseProviderConfig & {
+    type: 'vlm';
+    provider: 'openai' | 'anthropic' | 'google' | 'xai';
+    /** Model identifier string, e.g. 'google/gemini-2.5-flash-preview-09-2025' in the buildProviderFromConfig() example */ model: string;
+    /** Access route, either 'openrouter' or 'native'; optional */ via?: 'openrouter' | 'native';
+    /** NOTE(review): presumably overrides the API base URL; confirm against the implementation */ baseUrl?: string;
+};
+/**
+ * OCR provider configuration
+ *
+ * BaseProviderConfig intersected with a two-member union. Both members have
+ * `type: 'ocr'` and are told apart by `provider`:
+ * - 'surya': optional `endpoint`
+ * - 'marker': optional `force_ocr` and `use_llm` flags
+ *
+ * defineSuryaProvider() / defineMarkerProvider() accept the matching member
+ * without its `type` field.
+ */
+type OCRProviderConfig = BaseProviderConfig & ({
+    type: 'ocr';
+    provider: 'surya';
+    endpoint?: string;
+} | {
+    type: 'ocr';
+    provider: 'marker';
+    force_ocr?: boolean;
+    use_llm?: boolean;
+});
+/**
+ * All provider configurations
+ *
+ * Union of the VLM and OCR configs; the `type` field ('vlm' | 'ocr') tells
+ * the members apart.
+ */
+type ProviderConfig = VLMProviderConfig | OCRProviderConfig;
+/**
+ * Provider secrets (API keys, credentials)
+ * Stored separately from provider configs for security
+ *
+ * A record of per-provider secret bags. In the examples in this file the
+ * outer key equals the provider config's `id`. Every value in a bag is an
+ * optional string.
+ */
+type ProviderSecrets = Record<string, {
+    apiKey?: string;
+    /** Additional secret values (e.g., endpoint URLs, tokens) */
+    [key: string]: string | undefined;
+}>;
+/**
+ * Base provider interface - common members shared by all providers.
+ *
+ * Only two optional properties are declared here; no methods are part of
+ * this base shape.
+ */
+interface ProviderInstance {
+    /** Optional provider name for identification */
+    name?: string;
+    /** Capabilities of this provider instance (free-form record; values are `unknown`) */
+    capabilities?: Record<string, unknown>;
+}
+/**
+ * Provider registry - maps provider IDs to instantiated providers
+ * Uses a generic constraint to allow type narrowing when the provider type is known
+ *
+ * `T` must extend ProviderInstance and defaults to ProviderInstance.
+ */
+type ProviderRegistry<T extends ProviderInstance = ProviderInstance> = Record<string, T>;
+/**
+ * Helper to create VLM provider config
+ *
+ * @param config - A VLMProviderConfig without its `type` field
+ * @returns A full VLMProviderConfig (NOTE(review): presumably the input with `type: 'vlm'` added; confirm against the implementation)
+ */
+declare function defineVLMProvider(config: Omit<VLMProviderConfig, 'type'>): VLMProviderConfig;
+/**
+ * Helper to create Surya OCR provider config
+ *
+ * @param config - The 'surya' member of OCRProviderConfig without its `type` field
+ * @returns An OCRProviderConfig; the declared return type is the whole union, not narrowed to the 'surya' member
+ */
+declare function defineSuryaProvider(config: Omit<Extract<OCRProviderConfig, {
+    provider: 'surya';
+}>, 'type'>): OCRProviderConfig;
+/**
+ * Helper to create Marker OCR provider config
+ *
+ * @param config - The 'marker' member of OCRProviderConfig without its `type` field
+ * @returns An OCRProviderConfig; the declared return type is the whole union, not narrowed to the 'marker' member
+ */
+declare function defineMarkerProvider(config: Omit<Extract<OCRProviderConfig, {
+    provider: 'marker';
+}>, 'type'>): OCRProviderConfig;
+/**
+ * Build a provider instance from config and secrets
+ *
+ * @param config - Provider configuration
+ * @param secrets - Provider secrets (API keys); the example keys this record by the config's `id`
+ * @returns Promise resolving to the provider instance (the function is async, so it must be awaited)
+ *
+ * @example
+ * ```typescript
+ * const config: VLMProviderConfig = {
+ *   type: 'vlm',
+ *   id: 'gemini-flash',
+ *   provider: 'google',
+ *   model: 'google/gemini-2.5-flash-preview-09-2025',
+ *   via: 'openrouter'
+ * };
+ *
+ * const secrets: ProviderSecrets = {
+ *   'gemini-flash': {
+ *     apiKey: process.env.OPENROUTER_API_KEY
+ *   }
+ * };
+ *
+ * const provider = await buildProviderFromConfig(config, secrets);
+ * ```
+ */
+declare function buildProviderFromConfig(config: ProviderConfig, secrets: ProviderSecrets): Promise<ProviderInstance>;
+/**
+ * Build multiple providers from configs
+ *
+ * @param configs - Array of provider configurations
+ * @param secrets - Provider secrets
+ * @returns Promise resolving to a provider registry (map of IDs to instances)
+ *
+ * @example
+ * ```typescript
+ * const configs: ProviderConfig[] = [
+ *   { type: 'vlm', id: 'gemini', provider: 'google', model: '...', via: 'openrouter' },
+ *   { type: 'ocr', id: 'surya', provider: 'surya' }
+ * ];
+ *
+ * const secrets: ProviderSecrets = {
+ *   'gemini': { apiKey: process.env.OPENROUTER_API_KEY },
+ *   'surya': { apiKey: process.env.SURYA_API_KEY }
+ * };
+ *
+ * const providers = await buildProvidersFromConfigs(configs, secrets);
+ * // providers = { gemini: VLMProvider, surya: OCRProvider } (keyed by each config's id)
+ * ```
+ */
+declare function buildProvidersFromConfigs(configs: ProviderConfig[], secrets: ProviderSecrets): Promise<ProviderRegistry>;
+/**
+ * TypeScript utility types for auto-injected prompt variables
+ *
+ * These types document which variables are automatically injected by each node type,
+ * helping users understand what's available in their prompt templates.
+ */
+/**
+ * Variables auto-injected by the Extract node
+ * (the type selected by AutoVariablesForNode<'extract'>).
+ */
+interface ExtractAutoVariables {
+    /**
+     * The JSON schema for extraction, from config.schema
+     */
+    schema: object;
+    /**
+     * The document text extracted from DocumentIR or FlowInput
+     */
+    documentText: string;
+    /**
+     * Schema title from schema.title or default value
+     * Default: "the provided schema"
+     */
+    schemaTitle: string;
+    /**
+     * Schema description from schema.description or empty string
+     */
+    schemaDescription: string;
+    /**
+     * Generated formatting instructions for markdown/html output
+     * Only present when using structured formats (hence optional)
+     */
+    structuredFormat?: string;
+}
+/**
+ * Variables auto-injected by the Categorize node
+ * (the type selected by AutoVariablesForNode<'categorize'>).
+ */
+interface CategorizeAutoVariables {
+    /**
+     * Array of available categories from config.categories
+     */
+    categories: string[];
+    /**
+     * The document text extracted from DocumentIR or FlowInput
+     */
+    documentText: string;
+}
+/**
+ * Variables auto-injected by the Parse node
+ * (the type selected by AutoVariablesForNode<'parse'>).
+ */
+interface ParseAutoVariables {
+    /**
+     * Output format from config.format
+     * Default: 'text'
+     */
+    format: 'text' | 'markdown' | 'html';
+    /**
+     * Schema for structured parsing, if provided in config
+     */
+    schema?: object;
+    /**
+     * Whether to describe figures/charts/diagrams from config.describeFigures
+     * Default: false
+     */
+    describeFigures: boolean;
+    /**
+     * Whether citation tracking is enabled from config.citations?.enabled
+     * The property is always present, but its value is `undefined` when the
+     * optional chain yields nothing.
+     */
+    citationsEnabled: boolean | undefined;
+}
+/**
+ * Union type of all auto-injected variables across all node types
+ * that inject variables (extract, categorize, parse).
+ */
+type AllAutoVariables = ExtractAutoVariables | CategorizeAutoVariables | ParseAutoVariables;
+/**
+ * Utility type to get the auto-injected variables for a specific node type
+ *
+ * Maps 'extract' -> ExtractAutoVariables, 'categorize' -> CategorizeAutoVariables,
+ * 'parse' -> ParseAutoVariables. The trailing `never` branch is unreachable
+ * for any `T` allowed by the constraint.
+ */
+type AutoVariablesForNode<T extends 'extract' | 'categorize' | 'parse'> = T extends 'extract' ? ExtractAutoVariables : T extends 'categorize' ? CategorizeAutoVariables : T extends 'parse' ? ParseAutoVariables : never;
+/**
+ * Helper type for custom promptVariables that combines auto-injected vars with user vars
+ *
+ * The result is the intersection of the node's auto-injected variables and
+ * `TCustomVars`. `TCustomVars` defaults to `{}`, in which case the result is
+ * just the auto-injected variables.
+ */
+type PromptVariables<TNodeType extends 'extract' | 'categorize' | 'parse', TCustomVars extends Record<string, any> = {}> = AutoVariablesForNode<TNodeType> & TCustomVars;
+/**
+ * MIME Type Detection Utility
+ *
+ * Detects MIME types from actual file data (magic bytes) to prevent mismatches
+ * between declared MIME types and actual file content.
+ *
+ * Uses the `file-type` package for comprehensive format detection, with
+ * manual fallback for basic types in synchronous contexts.
+ */
+/**
+ * Detects MIME type from base64-encoded data using the file-type package.
+ * This is the preferred async method that supports 100+ file formats.
+ *
+ * @param base64Data - Base64 string (with or without data URI prefix)
+ * @returns Promise resolving to the detected MIME type (e.g., "image/jpeg", "application/pdf")
+ * @throws Error if format is unsupported or data is invalid
+ *
+ * @example
+ * ```typescript
+ * const base64 = "data:image/jpeg;base64,/9j/4AAQSkZJRg...";
+ * const mimeType = await detectMimeTypeFromBase64Async(base64);
+ * console.log(mimeType); // "image/jpeg"
+ * ```
+ */
+declare function detectMimeTypeFromBase64Async(base64Data: string): Promise<string>;
+/**
+ * Detects MIME type from base64-encoded data by examining magic bytes.
+ * This is a synchronous fallback for basic formats; prefer
+ * detectMimeTypeFromBase64Async() when awaiting is possible.
+ *
+ * Supports:
+ * - Images: JPEG, PNG, WebP, GIF, TIFF, BMP
+ * - Documents: PDF, RTF
+ * - Archives: ZIP (for DOCX, XLSX, PPTX, EPUB detection via extension)
+ *
+ * @param base64Data - Base64 string (with or without data URI prefix)
+ * @returns Detected MIME type (e.g., "image/jpeg", "application/pdf")
+ * @throws Error if format is unsupported or data is invalid
+ *
+ * @example
+ * ```typescript
+ * const base64 = "data:image/jpeg;base64,/9j/4AAQSkZJRg...";
+ * const mimeType = detectMimeTypeFromBase64(base64);
+ * console.log(mimeType); // "image/jpeg"
+ * ```
+ */
+declare function detectMimeTypeFromBase64(base64Data: string): string;
+/**
+ * Detects MIME type from raw byte array.
+ * Synchronous counterpart of detectMimeTypeFromBase64() for callers that
+ * already hold decoded bytes.
+ *
+ * @param bytes - Uint8Array containing file data
+ * @returns Detected MIME type
+ * @throws Error if format is unsupported
+ */
+declare function detectMimeTypeFromBytes(bytes: Uint8Array): string;
+/**
+ * Validates that declared MIME type matches actual file data.
+ *
+ * @param base64Data - Base64 string (with or without data URI prefix)
+ * @param declaredMimeType - MIME type that was declared/expected
+ * @returns Object with the validation result (`isValid`), the detected type
+ *   (`actualMimeType`) and the declared type as passed in (`declaredMimeType`)
+ *
+ * @example
+ * ```typescript
+ * const result = validateMimeType(base64Data, "image/jpeg");
+ * if (!result.isValid) {
+ *   console.warn(`MIME mismatch: declared ${result.declaredMimeType}, actual ${result.actualMimeType}`);
+ * }
+ * ```
+ */
+declare function validateMimeType(base64Data: string, declaredMimeType: string): {
+    isValid: boolean;
+    actualMimeType: string;
+    declaredMimeType: string;
+};
+/**
+ * Async version of validateMimeType using file-type for comprehensive detection.
+ *
+ * @param base64Data - Base64 string to inspect
+ * @param declaredMimeType - MIME type that was declared/expected
+ * @returns Promise resolving to the same result shape as validateMimeType()
+ */
+declare function validateMimeTypeAsync(base64Data: string, declaredMimeType: string): Promise<{
+    isValid: boolean;
+    actualMimeType: string;
+    declaredMimeType: string;
+}>;
+/**
+ * Extracts base64 data from a data URI or returns the data as-is if already base64.
+ *
+ * @param data - Data URI or base64 string
+ * @returns Pure base64 string without prefix
+ *
+ * @example
+ * ```typescript
+ * // Data URI: everything up to and including "base64," is dropped
+ * extractBase64("data:image/jpeg;base64,/9j/4AAQ...") // "/9j/4AAQ..."
+ * // Already plain base64: returned unchanged
+ * extractBase64("/9j/4AAQ...") // "/9j/4AAQ..."
+ * ```
+ */
+declare function extractBase64(data: string): string;
+/**
+ * Unified Provider Query Interface
+ *
+ * Provides a unified way to query and filter provider metadata across
+ * all provider packages (e.g., @doclo/providers-llm, @doclo/providers-datalab).
+ *
+ * @example
+ * ```typescript
+ * import { queryProviders, registerProviderMetadata } from '@doclo/core';
+ *
+ * // Register metadata from provider packages (done automatically if packages are imported)
+ * import { PROVIDER_METADATA as LLM_METADATA } from '@doclo/providers-llm';
+ * import { PROVIDER_METADATA as DATALAB_METADATA } from '@doclo/providers-datalab';
+ *
+ * registerProviderMetadata('llm', LLM_METADATA);
+ * registerProviderMetadata('datalab', DATALAB_METADATA);
+ *
+ * // Query providers
+ * const pdfProviders = queryProviders({ supports: { pdfs: true } });
+ * const cheapProviders = queryProviders({ maxCostPerPage: 0.01 });
+ * const largeFileProviders = queryProviders({ minFileSize: 100 }); // 100 MB+
+ * ```
+ */
+/**
+ * Input type requirements for providers/models.
+ * Modelled as a string union rather than a boolean to allow for future extensibility.
+ *
+ * - 'raw-document': Needs FlowInput with base64/url (OCR/VLM providers like marker-vlm)
+ * - 'parsed-text': Needs DocumentIR text output from parse step (text-only processors)
+ * - 'any': Can work with either (most vision LLMs like GPT-4o, Claude with vision)
+ *
+ * Used by InputRequirements.inputType and by the `inputRequirements.inputType`
+ * query filters.
+ */
+type ProviderInputType = 'raw-document' | 'parsed-text' | 'any';
+/**
+ * Input requirements specification for a provider or model.
+ * Determines what form of document input is expected.
+ */
+type InputRequirements = {
+    /**
+     * What type of input this provider accepts.
+     * - 'raw-document': Needs PDF/image bytes directly (marker-vlm, reducto-extract)
+     * - 'parsed-text': Needs DocumentIR text (text-only processors)
+     * - 'any': Can work with either (vision LLMs like GPT-4o, Claude)
+     */
+    inputType: ProviderInputType;
+    /**
+     * Accepted input methods when inputType is 'raw-document'.
+     * Inherited from inputFormats.inputMethods if not specified
+     * (same element type as NormalizedProviderMetadata.inputFormats.inputMethods).
+     */
+    acceptedMethods?: readonly ('url' | 'base64' | 'fileId')[];
+};
+/**
+ * Output format support flags
+ *
+ * One required boolean per output format. Used by NormalizedFeatures.outputFormats
+ * and by NormalizedProviderMetadata.capabilities.outputFormats.
+ */
+type OutputFormatSupport = {
+    text: boolean;
+    markdown: boolean;
+    html: boolean;
+    json: boolean;
+};
+/**
+ * Normalized features across all providers.
+ * Maps provider-specific option names to unified names.
+ *
+ * This enables UIs to query "what features does this provider support?"
+ * and get a consistent answer across all providers.
+ *
+ * Every flag is a required boolean. `outputFormats` is the only nested
+ * member, and FeatureName is derived from the keys of this type with
+ * `outputFormats` excluded.
+ */
+type NormalizedFeatures = {
+    /** Limit to first N pages */
+    maxPages: boolean;
+    /** Specific page range selection */
+    pageRange: boolean;
+    /** OCR language hints (maps from 'langs') */
+    languageHints: boolean;
+    /** Quality/speed modes (fast/balanced/high_accuracy) */
+    processingModes: boolean;
+    /** Reducto agentic mode (higher accuracy, more cost) */
+    agenticMode: boolean;
+    /** Custom prompts (maps from blockCorrectionPrompt, additionalPrompt, systemPrompt) */
+    customPrompts: boolean;
+    /** Extract embedded images (maps from extractImages, returnImages) */
+    imageExtraction: boolean;
+    /** Page delimiters (maps from paginate, addPageMarkers) */
+    pageMarkers: boolean;
+    /** Field-level citations with source references */
+    citations: boolean;
+    /** Document chunking modes (RAG-optimized) */
+    chunking: boolean;
+    /** Auto-segmentation for multi-document PDFs */
+    segmentation: boolean;
+    /** Re-run OCR on already-OCR'd documents */
+    stripExistingOCR: boolean;
+    /** Format lines in output */
+    formatLines: boolean;
+    /** Force OCR even if text layer exists */
+    forceOCR: boolean;
+    /** Table format options (html/json/md/csv) */
+    tableOutputFormats: boolean;
+    /** Merge consecutive tables */
+    tableMerging: boolean;
+    /** Block-level confidence scores */
+    confidence: boolean;
+    /** Bounding box coordinates for text/elements */
+    boundingBoxes: boolean;
+    /** JSON schema validation for structured output */
+    schemaValidation: boolean;
+    /** Handwritten text recognition support */
+    handwrittenText: boolean;
+    /** Supported output formats */
+    outputFormats: OutputFormatSupport;
+};
+/** Provider metadata in the normalized shape returned by getAllProviders(), queryProviders() and getProviderById(). */ type NormalizedProviderMetadata = {
+    /** Provider ID, the value getProviderById() looks up */ id: string;
+    name: string;
+    /** Source identifier (e.g. 'llm', 'datalab'); because of the trailing `| string`, any string is accepted here */ source: 'llm' | 'datalab' | 'unsiloed' | 'reducto' | string;
+    type: 'LLM' | 'OCR' | 'VLM' | 'Split';
+    identity?: {
+        /** Provider vendor (company) */
+        provider: ProviderVendor | string;
+        /** Model identifier */
+        model: string;
+        /** Access method (native, openrouter, self-hosted) */
+        method?: AccessMethod;
+    };
+    /** Boolean capability flags; aliased as NormalizedCapabilities */ capabilities: {
+        supportsImages: boolean;
+        supportsPDFs: boolean;
+        supportsDocuments: boolean;
+        supportsReasoning: boolean;
+        supportsStructuredOutput: boolean;
+        supportsPrompts: boolean;
+        supportsCitations: boolean;
+        supportsChunking: boolean;
+        supportsImageExtraction: boolean;
+        supportsPageMarkers: boolean;
+        supportsLanguageHints: boolean;
+        supportsProcessingModes: boolean;
+        supportsSegmentation: boolean;
+        outputFormats: OutputFormatSupport;
+    };
+    features: NormalizedFeatures;
+    inputRequirements: InputRequirements;
+    /** Per-node-type compatibility flags; aliased as NodeCompatibility */ compatibleNodes: {
+        parse: boolean;
+        extract: boolean;
+        categorize: boolean;
+        qualify: boolean;
+        split: boolean;
+    };
+    inputFormats: {
+        imageMimeTypes: readonly string[];
+        documentMimeTypes: readonly string[];
+        inputMethods: readonly ('url' | 'base64' | 'fileId')[];
+        /* NOTE(review): units of the size limits below are not stated here; the queryProviders() example annotates `minFileSize: 100` as "100 MB+", confirm */
+        maxImageSize?: number;
+        maxPdfSize?: number;
+        maxFileSize?: number;
+        maxPages?: number;
+    };
+    /** Pricing description; aliased as NormalizedPricing. `currency` is fixed to 'USD'. */ pricing: {
+        model: 'per-token' | 'per-page';
+        inputPer1kTokens?: number;
+        outputPer1kTokens?: number;
+        perPage?: number;
+        currency: 'USD';
+        notes?: string;
+    };
+    rateLimits?: {
+        requestsPerMinute?: number;
+        docsPerMinute?: number;
+    };
+    /** NOTE(review): presumably the un-normalized metadata as registered via registerProviderMetadata(); typed `unknown`, so narrow before use */ raw: unknown;
+};
+/**
+ * Feature names that can be queried (excludes outputFormats which is nested)
+ *
+ * Derived from the keys of NormalizedFeatures, so it stays in sync with that
+ * type; used as the element type of the `hasFeatures` query filter.
+ */
+type FeatureName = Exclude<keyof NormalizedFeatures, 'outputFormats'>;
+/** Filter object accepted by queryProviders(). Every field is optional. NOTE(review): how multiple fields combine (AND vs OR) is not visible in this declaration; confirm against the implementation. */ type ProviderQueryFilter = {
+    /** One source or a list of sources; any string is accepted */ source?: 'llm' | 'datalab' | 'unsiloed' | 'reducto' | string | string[];
+    /** One provider type or a list of types */ type?: 'LLM' | 'OCR' | 'VLM' | 'Split' | ('LLM' | 'OCR' | 'VLM' | 'Split')[];
+    /** Filter by provider vendor (company) */
+    provider?: ProviderVendor | ProviderVendor[] | string | string[];
+    /** Filter by model ID (requires provider to be specified for best results) */
+    model?: string | string[];
+    /** Filter by access method */
+    method?: AccessMethod | AccessMethod[];
+    /** Capability flags to match; the keys correspond to the `supports*` flags of NormalizedProviderMetadata.capabilities */ supports?: {
+        images?: boolean;
+        pdfs?: boolean;
+        documents?: boolean;
+        reasoning?: boolean;
+        structuredOutput?: boolean;
+        prompts?: boolean;
+        citations?: boolean;
+        chunking?: boolean;
+        imageExtraction?: boolean;
+        pageMarkers?: boolean;
+        languageHints?: boolean;
+        processingModes?: boolean;
+        segmentation?: boolean;
+    };
+    /** Feature names from NormalizedFeatures (NOTE(review): presumably all must be supported; confirm) */ hasFeatures?: FeatureName[];
+    outputFormat?: 'text' | 'markdown' | 'html' | 'json';
+    inputRequirements?: {
+        /**
+         * Filter by input type requirement.
+         * - 'raw-document': Only providers that need raw document input
+         * - 'parsed-text': Only providers that need parsed text
+         * - 'any': Only providers that accept any input type
+         * - ['raw-document', 'any']: Providers that accept raw documents (raw-document OR any)
+         */
+        inputType?: ProviderInputType | ProviderInputType[];
+    };
+    compatibleWith?: ('parse' | 'extract' | 'categorize' | 'qualify' | 'split')[];
+    mimeType?: string | string[];
+    /** The queryProviders() example annotates `minFileSize: 100` as "100 MB+" */ minFileSize?: number;
+    maxFileSize?: number;
+    maxCostPerPage?: number;
+    maxCostPer1kTokens?: number;
+    /** Custom predicate over the normalized provider metadata */ filter?: (provider: NormalizedProviderMetadata) => boolean;
+};
+/**
+ * Register provider metadata from a provider package
+ *
+ * @param source - Source identifier (e.g., 'llm', 'datalab')
+ * @param metadata - Raw metadata object from the provider package
+ * @param normalizer - Optional function to normalize the metadata; it receives
+ *   `(id, data, source)` and must return a NormalizedProviderMetadata
+ *
+ * @example
+ * ```typescript
+ * import { PROVIDER_METADATA } from '@doclo/providers-llm';
+ * // normalizeLLMMetadata is illustrative; it is not declared in this part of the file
+ * registerProviderMetadata('llm', PROVIDER_METADATA, normalizeLLMMetadata);
+ * ```
+ */
+declare function registerProviderMetadata(source: string, metadata: Record<string, unknown>, normalizer?: (id: string, data: unknown, source: string) => NormalizedProviderMetadata): void;
+/**
+ * Get all registered providers (normalized)
+ *
+ * @returns Array of NormalizedProviderMetadata entries
+ */
+declare function getAllProviders(): NormalizedProviderMetadata[];
+/**
+ * Query providers with filters
+ *
+ * @param filter - Query filters (optional; the parameter may be omitted)
+ * @returns Array of matching providers
+ *
+ * @example
+ * ```typescript
+ * // Get all providers that support PDFs
+ * const pdfProviders = queryProviders({ supports: { pdfs: true } });
+ *
+ * // Get cheap OCR providers
+ * const cheapOcr = queryProviders({
+ *   type: 'OCR',
+ *   maxCostPerPage: 0.02
+ * });
+ *
+ * // Get providers that can handle large files
+ * const largeFileProviders = queryProviders({ minFileSize: 100 });
+ *
+ * // Get providers compatible with extract() node
+ * const extractProviders = queryProviders({
+ *   compatibleWith: ['extract']
+ * });
+ * ```
+ */
+declare function queryProviders(filter?: ProviderQueryFilter): NormalizedProviderMetadata[];
+/**
+ * Get a single provider by ID
+ *
+ * @param id - Provider ID to look up
+ * @returns The provider's normalized metadata, or `undefined` (NOTE(review): presumably when no provider has this ID; confirm)
+ */
+declare function getProviderById(id: string): NormalizedProviderMetadata | undefined;
+/**
+ * Get providers by source
+ *
+ * @param source - Source identifier (e.g., 'llm', 'datalab')
+ * @returns Array of normalized provider metadata for that source
+ */
+declare function getProvidersBySource(source: string): NormalizedProviderMetadata[];
+/**
+ * Clear all registered providers (useful for testing)
+ *
+ * Takes no arguments and returns nothing.
+ */
+declare function clearProviderRegistry(): void;
+/**
+ * Get providers that support a specific MIME type
+ *
+ * @param mimeType - MIME type string (e.g., 'application/pdf')
+ * @returns Array of normalized provider metadata
+ */
+declare function getProvidersForMimeType(mimeType: string): NormalizedProviderMetadata[];
+/**
+ * Get the cheapest provider for a specific capability
+ *
+ * @param capability - One of 'ocr', 'extraction' or 'parse'
+ * @returns The selected provider's metadata, or `undefined` (NOTE(review): presumably when no provider qualifies; confirm)
+ */
+declare function getCheapestProviderFor(capability: 'ocr' | 'extraction' | 'parse'): NormalizedProviderMetadata | undefined;
+/**
+ * Get providers with the largest file size support
+ *
+ * @param minSizeMB - Optional minimum size; megabytes per the parameter name (NOTE(review): the default used when omitted is not visible in this declaration)
+ * @returns Array of normalized provider metadata
+ */
+declare function getProvidersForLargeFiles(minSizeMB?: number): NormalizedProviderMetadata[];
+/**
+ * Type alias for capabilities object (for model override typing)
+ *
+ * Indexed-access alias of NormalizedProviderMetadata['capabilities'], so it
+ * tracks that field automatically.
+ */
+type NormalizedCapabilities = NormalizedProviderMetadata['capabilities'];
+/**
+ * Type alias for node compatibility object
+ *
+ * Indexed-access alias of NormalizedProviderMetadata['compatibleNodes']
+ * (one boolean per node type: parse, extract, categorize, qualify, split).
+ */
+type NodeCompatibility = NormalizedProviderMetadata['compatibleNodes'];
+/**
+ * Type alias for pricing configuration
+ *
+ * Indexed-access alias of NormalizedProviderMetadata['pricing'].
+ */
+type NormalizedPricing = NormalizedProviderMetadata['pricing'];
+/**
+ * Node type names for querying
+ *
+ * The five names match the keys of NormalizedProviderMetadata['compatibleNodes'].
+ *
+ * NOTE(review): the re-export list at the top of this file also exports a
+ * `NodeTypeName` from './validation-CzOz6fwq.js'. This local alias is a
+ * separate declaration; confirm which one consumers are meant to see.
+ */
+type NodeTypeName = 'parse' | 'extract' | 'categorize' | 'qualify' | 'split';
+/**
+ * Model-level limits that may differ from provider defaults
+ *
+ * All fields are optional. NOTE(review): the unit of `maxFileSize` is not
+ * stated in this declaration; confirm before comparing it with other size
+ * fields.
+ */
+type ModelLimits = {
+    maxContextTokens?: number;
+    maxOutputTokens?: number;
+    maxFileSize?: number;
+    maxPages?: number;
+};
+/**
+ * Model-level metadata that can override provider defaults.
+ * Unspecified fields inherit from the provider.
+ *
+ * Only `id` is required. The override objects are `Partial<...>` of the
+ * provider-level types, so individual flags can be overridden one at a time.
+ */
+type ModelMetadata = {
+    /** Model ID as used in API calls */
+    id: string;
+    /** Human-readable name (optional, defaults to id) */
+    name?: string;
+    /** OpenRouter model ID (e.g., 'openai/gpt-4.1') */
+    openRouterId?: string;
+    /** Override provider capabilities */
+    capabilities?: Partial<NormalizedCapabilities>;
+    /** Override provider input requirements */
+    inputRequirements?: Partial<InputRequirements>;
+    /** Override provider node compatibility */
+    compatibleNodes?: Partial<NodeCompatibility>;
+    /** Model-specific pricing (same numeric fields as the provider-level pricing, without `model`, `currency` or `notes`) */
+    pricing?: {
+        inputPer1kTokens?: number;
+        outputPer1kTokens?: number;
+        perPage?: number;
+    };
+    /** Model-specific limits */
+    limits?: ModelLimits;
+};
+/**
+ * Provider metadata extended with model array.
+ *
+ * Intersection of NormalizedProviderMetadata with an optional `models`
+ * property, so any value of this type is also a valid
+ * NormalizedProviderMetadata. This is the metadata type accepted by
+ * registerProviderWithModels.
+ */
+type ProviderMetadataWithModels = NormalizedProviderMetadata & {
+    /** Per-model metadata with override capabilities; may be omitted entirely */
+    models?: ModelMetadata[];
+};
+/**
+ * Fully resolved model metadata (all inheritance applied).
+ *
+ * In contrast to ModelMetadata, `capabilities`, `inputRequirements` and
+ * `compatibleNodes` are the complete (non-Partial) types here, and `pricing`
+ * uses the provider-level NormalizedPricing shape. Only `openRouterId` and
+ * `limits` remain optional.
+ *
+ * Fields:
+ * - modelId / modelName: identify the model.
+ * - providerId / providerName / providerSource: identify the provider the
+ *   model belongs to.
+ * - features: typed as NormalizedFeatures; ModelMetadata declares no
+ *   corresponding override field.
+ *
+ * This is the element type returned by resolveModelMetadata, queryModels,
+ * getModelsForNode and getAllModels.
+ */
+type ResolvedModelMetadata = {
+    modelId: string;
+    modelName: string;
+    openRouterId?: string;
+    providerId: string;
+    providerName: string;
+    providerSource: string;
+    capabilities: NormalizedCapabilities;
+    features: NormalizedFeatures;
+    inputRequirements: InputRequirements;
+    compatibleNodes: NodeCompatibility;
+    pricing: NormalizedPricing;
+    limits?: ModelLimits;
+};
+/**
+ * Filter options for model queries.
+ *
+ * Every criterion is optional. `providerId`, `source` and
+ * `inputRequirements.inputType` each accept either a single value or an
+ * array of values.
+ *
+ * NOTE(review): how multiple criteria are combined, and how array values
+ * are matched, is decided by the implementation and is not visible from this
+ * declaration - presumably criteria are ANDed and arrays mean "any of";
+ * confirm before relying on it.
+ */
+type ModelQueryFilter = {
+    /** Filter by provider ID */
+    providerId?: string | string[];
+    /** Filter by provider source */
+    source?: string | string[];
+    /** Filter by capabilities; each flag is an optional boolean */
+    supports?: {
+        images?: boolean;
+        pdfs?: boolean;
+        documents?: boolean;
+        reasoning?: boolean;
+        structuredOutput?: boolean;
+        prompts?: boolean;
+        citations?: boolean;
+        chunking?: boolean;
+        imageExtraction?: boolean;
+        pageMarkers?: boolean;
+        languageHints?: boolean;
+        processingModes?: boolean;
+        segmentation?: boolean;
+    };
+    /** Filter by specific features (all must be supported) */
+    hasFeatures?: FeatureName[];
+    /** Filter by output format support */
+    outputFormat?: 'text' | 'markdown' | 'html' | 'json';
+    /** Filter by input requirements */
+    inputRequirements?: {
+        inputType?: ProviderInputType | ProviderInputType[];
+    };
+    /** Filter by node compatibility (values come from the local NodeTypeName union) */
+    compatibleWith?: NodeTypeName[];
+    /** Filter by context window (minimum) */
+    minContextTokens?: number;
+    /** Custom filter function; receives a resolved model and returns a boolean */
+    filter?: (model: ResolvedModelMetadata) => boolean;
+};
+/**
+ * Register provider metadata with model information
+ *
+ * @param providerId - Provider identifier
+ * @param metadata - Provider metadata with models array (the `models`
+ *        property is optional in ProviderMetadataWithModels)
+ * @returns Nothing.
+ *
+ * NOTE(review): behavior when `providerId` is already registered (overwrite,
+ * merge or error) is not visible from this declaration - confirm against the
+ * implementation.
+ */
+declare function registerProviderWithModels(providerId: string, metadata: ProviderMetadataWithModels): void;
+/**
+ * Resolve model metadata by applying inheritance from provider.
+ * Returns fully resolved metadata for a specific model.
+ *
+ * @param providerId - Provider ID (e.g., 'openai', 'anthropic')
+ * @param modelId - Model ID (e.g., 'gpt-4.1', 'claude-sonnet-4.5'). If not provided, returns provider defaults.
+ * @returns Resolved model metadata or undefined if not found. Callers must
+ *          handle the `undefined` case; the examples below use optional
+ *          chaining for that reason.
+ *
+ * @example
+ * ```typescript
+ * const gpt4 = resolveModelMetadata('openai', 'gpt-4.1');
+ * console.log(gpt4?.capabilities.supportsReasoning); // false
+ *
+ * const o3 = resolveModelMetadata('openai', 'o3');
+ * console.log(o3?.capabilities.supportsReasoning); // true
+ * ```
+ */
+declare function resolveModelMetadata(providerId: string, modelId?: string): ResolvedModelMetadata | undefined;
+/**
+ * Query models with filters.
+ * Returns all models that match the filter criteria.
+ *
+ * @param filter - Query filters. The argument is optional; presumably
+ *        omitting it returns every registered model - TODO confirm against
+ *        the implementation.
+ * @returns Array of matching resolved model metadata
+ *
+ * @example
+ * ```typescript
+ * // Get all reasoning models
+ * const reasoningModels = queryModels({ supports: { reasoning: true } });
+ *
+ * // Get models with large context windows
+ * const largeContextModels = queryModels({ minContextTokens: 100000 });
+ *
+ * // Get OpenAI models compatible with extract()
+ * const openaiExtract = queryModels({
+ *   providerId: 'openai',
+ *   compatibleWith: ['extract']
+ * });
+ * ```
+ */
+declare function queryModels(filter?: ModelQueryFilter): ResolvedModelMetadata[];
+/**
+ * Get all models compatible with a specific node type.
+ *
+ * @param nodeType - Node type to check compatibility. Typed with the local
+ *        NodeTypeName union declared in this file: 'parse', 'extract',
+ *        'categorize', 'qualify' or 'split'.
+ * @returns Array of resolved model metadata
+ *
+ * @example
+ * ```typescript
+ * // Get all models that can be used with extract()
+ * const extractModels = getModelsForNode('extract');
+ *
+ * // Get all models that can be used with parse()
+ * const parseModels = getModelsForNode('parse');
+ * ```
+ */
+declare function getModelsForNode(nodeType: NodeTypeName): ResolvedModelMetadata[];
+/**
+ * Get all registered models (resolved)
+ *
+ * Takes no arguments.
+ *
+ * @returns Array of ResolvedModelMetadata. Ordering of the result is not
+ *          specified by this declaration.
+ */
+declare function getAllModels(): ResolvedModelMetadata[];
+/**
+ * Clear model registry (useful for testing)
+ *
+ * Takes no arguments and returns nothing.
+ *
+ * NOTE(review): this file also exports a separate `clearProviderRegistry`;
+ * whether clearing the model registry affects provider-level registrations
+ * is not visible from this declaration - confirm before relying on it for
+ * test isolation.
+ */
+declare function clearModelRegistry(): void;
+export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type VLMProviderConfig, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, clearModelRegistry, clearProviderRegistry, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, getAllModels, getAllProviders, getCheapestProviderFor, getModelsForNode, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isPDFDocument, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, resolveDocument, resolveModelMetadata, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync };