npm - @doclo/providers-llm - Versions diffs - 0.1.7 → 0.1.9 - Mend

@doclo/providers-llm 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/dist/chunk-7YPJIWRM.js +291 -0
package/dist/chunk-7YPJIWRM.js.map +1 -0
package/dist/index.d.ts +275 -4
package/dist/index.js +317 -146
package/dist/index.js.map +1 -1
package/dist/schema-prompt-formatter-AIORLWUF.js +29 -0
package/dist/schema-prompt-formatter-AIORLWUF.js.map +1 -0
package/package.json +2 -2

package/dist/index.d.ts CHANGED

@@ -93,6 +93,7 @@ interface LLMResponse<T = unknown> {
     metrics: ResponseMetrics;
     reasoning?: string;
     reasoning_details?: ReasoningDetail[];
+    metadata?: LLMExtractedMetadata;
 }
 /** Provider capability flags */
 interface ProviderCapabilities {
@@ -106,6 +107,100 @@ interface ProviderCapabilities {
 }
 /** JSON output mode */
 type JsonMode = 'strict' | 'relaxed';
+/**
+ * LLM-derived feature options that are implemented via prompting.
+ * These options are normalized across providers and work through prompt engineering.
+ */
+interface LLMDerivedOptions {
+    /** Format for text output (markdown, html, json, text) */
+    outputFormat?: 'markdown' | 'html' | 'json' | 'text';
+    /** Format for tables within text fields */
+    tableFormat?: 'markdown' | 'html' | 'csv';
+    /** Add page break markers (---) between pages */
+    pageMarkers?: boolean;
+    /** Include per-field confidence scores (attached to result, not in JSON) */
+    includeConfidence?: boolean;
+    /** Include source citations with bounding boxes (attached to result, not in JSON) */
+    includeSources?: boolean;
+    /** Include block type classification for each extracted element */
+    includeBlockTypes?: boolean;
+    /** Extract document headers (repeated content at top of pages) */
+    extractHeaders?: boolean;
+    /** Extract document footers (repeated content at bottom of pages) */
+    extractFooters?: boolean;
+    /** Document chunking strategy */
+    chunkingStrategy?: 'page' | 'section' | 'paragraph' | 'semantic';
+    /** Maximum chunk size in characters (when using chunking) */
+    maxChunkSize?: number;
+    /** Language hints for the document */
+    languageHints?: string[];
+    /**
+     * Normalize date fields to ISO 8601 format (YYYY-MM-DD)
+     * When enabled, date fields in the extraction output will be formatted consistently.
+     * Native support: Extend.ai (extend:type: "date")
+     * LLM support: Via prompting
+     */
+    dateNormalization?: boolean;
+    /**
+     * Normalize currency fields to { amount: number, currency: string } objects
+     * When enabled, monetary values are extracted as structured objects with ISO 4217 currency codes.
+     * Native support: Extend.ai (extend:type: "currency")
+     * LLM support: Via prompting
+     */
+    currencyNormalization?: boolean;
+    /**
+     * Detect and extract signature fields from documents
+     * When enabled, signature presence is detected and locations are reported.
+     * Native support: Extend.ai (extend:type: "signature"), Reducto
+     * LLM support: Via prompting (less reliable)
+     */
+    signatureDetection?: boolean;
+}
+/**
+ * Extracted metadata from LLM response (populated when derived options are enabled)
+ */
+interface LLMExtractedMetadata {
+    /** Per-field confidence scores (0-1) */
+    confidence?: Record<string, number>;
+    /** Source citations with bounding boxes */
+    sources?: Array<{
+        field: string;
+        text: string;
+        bbox?: [number, number, number, number];
+        page?: number;
+    }>;
+    /** Block type classifications */
+    blockTypes?: Record<string, string>;
+    /** Extracted headers */
+    headers?: Array<{
+        text: string;
+        pages: number[];
+    }>;
+    /** Extracted footers */
+    footers?: Array<{
+        text: string;
+        pages: number[];
+    }>;
+    /** Detected signatures with location and confidence */
+    signatures?: Array<{
+        field: string;
+        detected: boolean;
+        bbox?: [number, number, number, number];
+        page?: number;
+        confidence?: number;
+    }>;
+    /** Normalized currency values (original → normalized mapping) */
+    normalizedCurrencies?: Record<string, {
+        original: string;
+        amount: number;
+        currency: string;
+    }>;
+    /** Normalized date values (original → normalized mapping) */
+    normalizedDates?: Record<string, {
+        original: string;
+        normalized: string;
+    }>;
+}
 /** Provider interface */
 interface LLMProvider {
     readonly name: string;
@@ -117,6 +212,7 @@ interface LLMProvider {
         max_tokens?: number;
         reasoning?: ReasoningConfig;
         embedSchemaInPrompt?: boolean;
+        derivedOptions?: LLMDerivedOptions;
     }): Promise<LLMResponse<T>>;
 }
 /** Reasoning configuration (normalized across providers) */
@@ -263,6 +359,82 @@ declare function buildSchemaPromptSection(schema: JSONSchema): string;
  * Combines schema prompt section with user's custom prompt
  */
 declare function combineSchemaAndUserPrompt(schema: JSONSchema, userPrompt: string): string;
+/**
+ * Output format types for LLM text generation
+ */
+type OutputFormat = 'markdown' | 'html' | 'json' | 'text';
+type TableFormat = 'markdown' | 'html' | 'csv';
+type ChunkingStrategy = 'page' | 'section' | 'paragraph' | 'semantic';
+/**
+ * Options for LLM-derived features that are implemented via prompting
+ */
+interface LLMDerivedPromptOptions {
+    outputFormat?: OutputFormat;
+    tableFormat?: TableFormat;
+    pageMarkers?: boolean;
+    includeConfidence?: boolean;
+    includeSources?: boolean;
+    includeBlockTypes?: boolean;
+    extractHeaders?: boolean;
+    extractFooters?: boolean;
+    chunkingStrategy?: ChunkingStrategy;
+    maxChunkSize?: number;
+    languageHints?: string[];
+}
+/**
+ * Builds prompt additions for output format options
+ */
+declare function buildOutputFormatPrompt(options: LLMDerivedPromptOptions): string;
+/**
+ * Builds prompt additions for language hints
+ */
+declare function buildLanguageHintsPrompt(languages: string[]): string;
+/**
+ * Builds prompt additions for confidence scoring
+ */
+declare function buildConfidencePrompt(): string;
+/**
+ * Builds prompt additions for source citations with bounding boxes
+ */
+declare function buildSourcesPrompt(): string;
+/**
+ * Builds prompt additions for block type classification
+ */
+declare function buildBlockClassificationPrompt(): string;
+/**
+ * Combines all LLM-derived feature prompts into a single prompt section
+ */
+declare function buildLLMDerivedFeaturesPrompt(options: LLMDerivedPromptOptions): string;
+/**
+ * Combines schema prompt with user prompt and LLM-derived features
+ */
+declare function combineSchemaUserAndDerivedPrompts(schema: JSONSchema, userPrompt: string, derivedOptions?: LLMDerivedPromptOptions): string;
+/**
+ * Utility for extracting metadata from LLM responses
+ * Handles the `_` prefixed fields that contain confidence, sources, etc.
+ */
+/**
+ * Extracts metadata fields from a JSON response and returns clean JSON + metadata
+ *
+ * @param json - The raw JSON response from the LLM (may contain _ prefixed fields)
+ * @returns Object with clean JSON (metadata removed) and extracted metadata
+ */
+declare function extractMetadataFromResponse<T>(json: unknown): {
+    json: T;
+    metadata?: LLMExtractedMetadata;
+};
+/**
+ * Checks if derived options require metadata extraction
+ */
+declare function shouldExtractMetadata(derivedOptions?: {
+    includeConfidence?: boolean;
+    includeSources?: boolean;
+    includeBlockTypes?: boolean;
+    extractHeaders?: boolean;
+    extractFooters?: boolean;
+}): boolean;
 /**
  * Factory function type for creating provider instances
@@ -325,12 +497,14 @@ declare class OpenAIProvider implements LLMProvider {
     private limits;
     constructor(config: ProviderConfig);
     completeJson<T>(params: {
-        input: MultimodalInput;
+        input?: MultimodalInput;
+        prompt?: MultimodalInput | string;
         schema?: UnifiedSchema<T>;
         mode?: JsonMode;
         max_tokens?: number;
         reasoning?: ReasoningConfig;
         embedSchemaInPrompt?: boolean;
+        derivedOptions?: LLMDerivedOptions;
     }): Promise<LLMResponse<T>>;
     private buildReasoningConfig;
     private buildMessages;
@@ -349,12 +523,14 @@ declare class AnthropicProvider implements LLMProvider {
     private limits;
     constructor(config: ProviderConfig);
     completeJson<T>(params: {
-        input: MultimodalInput;
+        input?: MultimodalInput;
+        prompt?: MultimodalInput | string;
         schema?: UnifiedSchema<T>;
         mode?: JsonMode;
         max_tokens?: number;
         reasoning?: ReasoningConfig;
         embedSchemaInPrompt?: boolean;
+        derivedOptions?: LLMDerivedOptions;
     }): Promise<LLMResponse<T>>;
     private buildNativeThinkingConfig;
     private translateToOpenRouterFormat;
@@ -394,6 +570,7 @@ declare class GoogleProvider implements LLMProvider {
         max_tokens?: number;
         reasoning?: ReasoningConfig;
         embedSchemaInPrompt?: boolean;
+        derivedOptions?: LLMDerivedOptions;
     }): Promise<LLMResponse<T>>;
     private buildNativeThinkingConfig;
     private translateToOpenRouterFormat;
@@ -415,12 +592,14 @@ declare class XAIProvider implements LLMProvider {
     private limits;
     constructor(config: ProviderConfig);
     completeJson<T>(params: {
-        input: MultimodalInput;
+        input?: MultimodalInput;
+        prompt?: MultimodalInput | string;
         schema?: UnifiedSchema<T>;
         mode?: JsonMode;
         max_tokens?: number;
         reasoning?: ReasoningConfig;
         embedSchemaInPrompt?: boolean;
+        derivedOptions?: LLMDerivedOptions;
     }): Promise<LLMResponse<T>>;
     private buildReasoningConfig;
     private buildMessages;
@@ -456,6 +635,98 @@ declare class FallbackManager {
  */
 declare function adaptToCoreLLMProvider(provider: LLMProvider): LLMJsonProvider;
+/**
+ * Schema for Gemini bounding box detection
+ * Used for OCR-style parsing with spatial information
+ *
+ * Note: Gemini uses [y_min, x_min, y_max, x_max] coordinate order (Y first, not X!)
+ * Coordinates are normalized to 0-1000 (to get pixel values, divide by 1000 and multiply by the image dimensions)
+ */
+/**
+ * Block types for document structure classification
+ */
+declare const BLOCK_TYPES: readonly ["title", "paragraph", "table", "list", "header", "footer", "caption", "code", "image", "form", "signature", "handwriting"];
+type BlockType = typeof BLOCK_TYPES[number];
+/**
+ * Single text block with bounding box
+ */
+interface GeminiBoundingBoxBlock {
+    /**
+     * Bounding box coordinates: [y_min, x_min, y_max, x_max]
+     * Normalized to 0-1000 (Gemini format)
+     */
+    box_2d: [number, number, number, number];
+    /**
+     * Text content within the bounding box
+     */
+    text: string;
+    /**
+     * Block type classification
+     */
+    type: BlockType;
+    /**
+     * Confidence level (optional)
+     */
+    confidence?: 'high' | 'medium' | 'low';
+    /**
+     * Page number (0-indexed, for multi-page documents)
+     */
+    page?: number;
+}
+/**
+ * JSON Schema for Gemini bounding box extraction
+ * This schema is used with Gemini models to extract text with spatial information
+ */
+declare const geminiBoundingBoxSchema: UnifiedSchema<GeminiBoundingBoxBlock[]>;
+/**
+ * Prompt for Gemini bounding box extraction
+ * This activates Gemini's spatial understanding capabilities
+ */
+declare const GEMINI_BBOX_EXTRACTION_PROMPT = "Analyze this document and extract all text with precise bounding box locations.\n\nFor each text block, provide:\n- box_2d: Bounding box as [y_min, x_min, y_max, x_max] normalized to 0-1000\n- text: The exact text content\n- type: Block classification (title, paragraph, table, list, header, footer, caption, code, image, form, signature, handwriting)\n- confidence: Your confidence level (high, medium, low)\n- page: Page number (0-indexed) for multi-page documents\n\nIMPORTANT coordinate format:\n- Use [y_min, x_min, y_max, x_max] order (Y coordinate first, then X)\n- Normalize all values to 0-1000 range (top-left is [0, 0], bottom-right is [1000, 1000])\n\nReturn ONLY a valid JSON array, no other text.";
+/**
+ * Normalized bounding box format (0-1 range)
+ * This is the SDK's standard format after conversion from Gemini's 0-1000 format
+ */
+interface NormalizedBBox {
+    x: number;
+    y: number;
+    width: number;
+    height: number;
+}
+/**
+ * Convert Gemini 0-1000 coordinates to normalized 0-1 format
+ * Note: Gemini uses [y_min, x_min, y_max, x_max] order
+ *
+ * @param geminiBBox - Bounding box from Gemini [y_min, x_min, y_max, x_max] (0-1000)
+ * @returns Normalized bounding box with x, y, width, height (0-1)
+ */
+declare function normalizeGeminiBBox(geminiBBox: [number, number, number, number]): NormalizedBBox;
+/**
+ * Convert normalized 0-1 format back to Gemini 0-1000 coordinates
+ *
+ * @param bbox - Normalized bounding box (0-1)
+ * @returns Gemini format [y_min, x_min, y_max, x_max] (0-1000)
+ */
+declare function toGeminiBBox(bbox: NormalizedBBox): [number, number, number, number];
+/**
+ * Document block in DocumentIR-compatible format (the result of converting a Gemini bounding box block)
+ */
+interface DocumentBlock {
+    text: string;
+    bbox: NormalizedBBox;
+    type: BlockType;
+    confidence?: number;
+    page?: number;
+}
+/**
+ * Convert Gemini extraction result to DocumentIR blocks
+ *
+ * @param geminiBlocks - Raw blocks from Gemini extraction
+ * @returns Document blocks with normalized coordinates
+ */
+declare function convertGeminiBlocksToDocumentBlocks(geminiBlocks: GeminiBoundingBoxBlock[]): DocumentBlock[];
 /**
  * LLM Provider Metadata
  *
@@ -1417,4 +1688,4 @@ declare function createVLMProvider(config: {
  */
 declare function buildLLMProvider(config: FallbackConfig): VLMProvider;
-export { type AccessMethod, AnthropicProvider, type CircuitBreakerState, type FallbackConfig, FallbackManager, GoogleProvider, type ImageInput, type JsonMode, type LLMModelMetadata, type LLMProvider, type LLMProviderMetadata, type LLMProviderType, type LLMResponse, type MultimodalInput, type NodeType, OpenAIProvider, type PDFInput, PROVIDER_METADATA, type ProviderCapabilities, type ProviderConfig, type ProviderFactory, type ProviderInputType, type ProviderType, type ReasoningConfig, type ReasoningDetail, type ResourceLimits, type ResponseMetrics, SUPPORTED_IMAGE_TYPES, SchemaTranslator, type SupportedImageMimeType, type UnifiedSchema, XAIProvider, adaptToCoreLLMProvider, buildLLMProvider, buildSchemaPromptSection, combineSchemaAndUserPrompt, compareNativeVsOpenRouter, createProviderFromRegistry, createVLMProvider, estimateCost, formatSchemaForPrompt, getCheapestProvider, getProvidersForNode, isImageTypeSupported, isProviderCompatibleWithNode, providerRegistry, registerProvider, supportsPDFsInline };
+export { type AccessMethod, AnthropicProvider, BLOCK_TYPES, type BlockType, type CircuitBreakerState, type DocumentBlock, type FallbackConfig, FallbackManager, GEMINI_BBOX_EXTRACTION_PROMPT, type GeminiBoundingBoxBlock, GoogleProvider, type ImageInput, type JsonMode, type LLMDerivedOptions, type LLMExtractedMetadata, type LLMModelMetadata, type LLMProvider, type LLMProviderMetadata, type LLMProviderType, type LLMResponse, type MultimodalInput, type NodeType, type NormalizedBBox, OpenAIProvider, type PDFInput, PROVIDER_METADATA, type ProviderCapabilities, type ProviderConfig, type ProviderFactory, type ProviderInputType, type ProviderType, type ReasoningConfig, type ReasoningDetail, type ResourceLimits, type ResponseMetrics, SUPPORTED_IMAGE_TYPES, SchemaTranslator, type SupportedImageMimeType, type UnifiedSchema, XAIProvider, adaptToCoreLLMProvider, buildBlockClassificationPrompt, buildConfidencePrompt, buildLLMDerivedFeaturesPrompt, buildLLMProvider, buildLanguageHintsPrompt, buildOutputFormatPrompt, buildSchemaPromptSection, buildSourcesPrompt, combineSchemaAndUserPrompt, combineSchemaUserAndDerivedPrompts, compareNativeVsOpenRouter, convertGeminiBlocksToDocumentBlocks, createProviderFromRegistry, createVLMProvider, estimateCost, extractMetadataFromResponse, formatSchemaForPrompt, geminiBoundingBoxSchema, getCheapestProvider, getProvidersForNode, isImageTypeSupported, isProviderCompatibleWithNode, normalizeGeminiBBox, providerRegistry, registerProvider, shouldExtractMetadata, supportsPDFsInline, toGeminiBBox };