@doclo/providers-llm 0.1.7 → 0.1.9

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -93,6 +93,7 @@ interface LLMResponse<T = unknown> {
93
93
  metrics: ResponseMetrics;
94
94
  reasoning?: string;
95
95
  reasoning_details?: ReasoningDetail[];
96
+ metadata?: LLMExtractedMetadata;
96
97
  }
97
98
  /** Provider capability flags */
98
99
  interface ProviderCapabilities {
@@ -106,6 +107,100 @@ interface ProviderCapabilities {
106
107
  }
107
108
  /** JSON output mode */
108
109
  type JsonMode = 'strict' | 'relaxed';
110
+ /**
111
+ * LLM-derived feature options that are implemented via prompting
112
+ * These options are normalized across providers and work through prompt engineering
113
+ */
114
+ interface LLMDerivedOptions {
115
+ /** Format for text output (markdown, html, json, text) */
116
+ outputFormat?: 'markdown' | 'html' | 'json' | 'text';
117
+ /** Format for tables within text fields */
118
+ tableFormat?: 'markdown' | 'html' | 'csv';
119
+ /** Add page break markers (---) between pages */
120
+ pageMarkers?: boolean;
121
+ /** Include per-field confidence scores (attached to result, not in JSON) */
122
+ includeConfidence?: boolean;
123
+ /** Include source citations with bounding boxes (attached to result, not in JSON) */
124
+ includeSources?: boolean;
125
+ /** Include block type classification for each extracted element */
126
+ includeBlockTypes?: boolean;
127
+ /** Extract document headers (repeated content at top of pages) */
128
+ extractHeaders?: boolean;
129
+ /** Extract document footers (repeated content at bottom of pages) */
130
+ extractFooters?: boolean;
131
+ /** Document chunking strategy */
132
+ chunkingStrategy?: 'page' | 'section' | 'paragraph' | 'semantic';
133
+ /** Maximum chunk size in characters (when using chunking) */
134
+ maxChunkSize?: number;
135
+ /** Language hints for the document */
136
+ languageHints?: string[];
137
+ /**
138
+ * Normalize date fields to ISO 8601 format (YYYY-MM-DD)
139
+ * When enabled, date fields in the extraction output will be formatted consistently.
140
+ * Native support: Extend.ai (extend:type: "date")
141
+ * LLM support: Via prompting
142
+ */
143
+ dateNormalization?: boolean;
144
+ /**
145
+ * Normalize currency fields to { amount: number, currency: string } objects
146
+ * When enabled, monetary values are extracted as structured objects with ISO 4217 currency codes.
147
+ * Native support: Extend.ai (extend:type: "currency")
148
+ * LLM support: Via prompting
149
+ */
150
+ currencyNormalization?: boolean;
151
+ /**
152
+ * Detect and extract signature fields from documents
153
+ * When enabled, signature presence is detected and locations are reported.
154
+ * Native support: Extend.ai (extend:type: "signature"), Reducto
155
+ * LLM support: Via prompting (less reliable)
156
+ */
157
+ signatureDetection?: boolean;
158
+ }
159
+ /**
160
+ * Extracted metadata from LLM response (populated when derived options are enabled)
161
+ */
162
+ interface LLMExtractedMetadata {
163
+ /** Per-field confidence scores (0-1) */
164
+ confidence?: Record<string, number>;
165
+ /** Source citations with bounding boxes */
166
+ sources?: Array<{
167
+ field: string;
168
+ text: string;
169
+ bbox?: [number, number, number, number];
170
+ page?: number;
171
+ }>;
172
+ /** Block type classifications */
173
+ blockTypes?: Record<string, string>;
174
+ /** Extracted headers */
175
+ headers?: Array<{
176
+ text: string;
177
+ pages: number[];
178
+ }>;
179
+ /** Extracted footers */
180
+ footers?: Array<{
181
+ text: string;
182
+ pages: number[];
183
+ }>;
184
+ /** Detected signatures with location and confidence */
185
+ signatures?: Array<{
186
+ field: string;
187
+ detected: boolean;
188
+ bbox?: [number, number, number, number];
189
+ page?: number;
190
+ confidence?: number;
191
+ }>;
192
+ /** Normalized currency values (original → normalized mapping) */
193
+ normalizedCurrencies?: Record<string, {
194
+ original: string;
195
+ amount: number;
196
+ currency: string;
197
+ }>;
198
+ /** Normalized date values (original → normalized mapping) */
199
+ normalizedDates?: Record<string, {
200
+ original: string;
201
+ normalized: string;
202
+ }>;
203
+ }
109
204
  /** Provider interface */
110
205
  interface LLMProvider {
111
206
  readonly name: string;
@@ -117,6 +212,7 @@ interface LLMProvider {
117
212
  max_tokens?: number;
118
213
  reasoning?: ReasoningConfig;
119
214
  embedSchemaInPrompt?: boolean;
215
+ derivedOptions?: LLMDerivedOptions;
120
216
  }): Promise<LLMResponse<T>>;
121
217
  }
122
218
  /** Reasoning configuration (normalized across providers) */
@@ -263,6 +359,82 @@ declare function buildSchemaPromptSection(schema: JSONSchema): string;
263
359
  * Combines schema prompt section with user's custom prompt
264
360
  */
265
361
  declare function combineSchemaAndUserPrompt(schema: JSONSchema, userPrompt: string): string;
362
+ /**
363
+ * Output format types for LLM text generation
364
+ */
365
+ type OutputFormat = 'markdown' | 'html' | 'json' | 'text';
366
+ type TableFormat = 'markdown' | 'html' | 'csv';
367
+ type ChunkingStrategy = 'page' | 'section' | 'paragraph' | 'semantic';
368
+ /**
369
+ * Options for LLM-derived features that are implemented via prompting
370
+ */
371
+ interface LLMDerivedPromptOptions {
372
+ outputFormat?: OutputFormat;
373
+ tableFormat?: TableFormat;
374
+ pageMarkers?: boolean;
375
+ includeConfidence?: boolean;
376
+ includeSources?: boolean;
377
+ includeBlockTypes?: boolean;
378
+ extractHeaders?: boolean;
379
+ extractFooters?: boolean;
380
+ chunkingStrategy?: ChunkingStrategy;
381
+ maxChunkSize?: number;
382
+ languageHints?: string[];
383
+ }
384
+ /**
385
+ * Builds prompt additions for output format options
386
+ */
387
+ declare function buildOutputFormatPrompt(options: LLMDerivedPromptOptions): string;
388
+ /**
389
+ * Builds prompt additions for language hints
390
+ */
391
+ declare function buildLanguageHintsPrompt(languages: string[]): string;
392
+ /**
393
+ * Builds prompt additions for confidence scoring
394
+ */
395
+ declare function buildConfidencePrompt(): string;
396
+ /**
397
+ * Builds prompt additions for source citations with bounding boxes
398
+ */
399
+ declare function buildSourcesPrompt(): string;
400
+ /**
401
+ * Builds prompt additions for block type classification
402
+ */
403
+ declare function buildBlockClassificationPrompt(): string;
404
+ /**
405
+ * Combines all LLM-derived feature prompts into a single prompt section
406
+ */
407
+ declare function buildLLMDerivedFeaturesPrompt(options: LLMDerivedPromptOptions): string;
408
+ /**
409
+ * Combines schema prompt with user prompt and LLM-derived features
410
+ */
411
+ declare function combineSchemaUserAndDerivedPrompts(schema: JSONSchema, userPrompt: string, derivedOptions?: LLMDerivedPromptOptions): string;
412
+
413
+ /**
414
+ * Utility for extracting metadata from LLM responses
415
+ * Handles the `_` prefixed fields that contain confidence, sources, etc.
416
+ */
417
+
418
+ /**
419
+ * Extracts metadata fields from a JSON response and returns clean JSON + metadata
420
+ *
421
+ * @param json - The raw JSON response from the LLM (may contain _ prefixed fields)
422
+ * @returns Object with clean JSON (metadata removed) and extracted metadata
423
+ */
424
+ declare function extractMetadataFromResponse<T>(json: unknown): {
425
+ json: T;
426
+ metadata?: LLMExtractedMetadata;
427
+ };
428
+ /**
429
+ * Checks if derived options require metadata extraction
430
+ */
431
+ declare function shouldExtractMetadata(derivedOptions?: {
432
+ includeConfidence?: boolean;
433
+ includeSources?: boolean;
434
+ includeBlockTypes?: boolean;
435
+ extractHeaders?: boolean;
436
+ extractFooters?: boolean;
437
+ }): boolean;
266
438
 
267
439
  /**
268
440
  * Factory function type for creating provider instances
@@ -325,12 +497,14 @@ declare class OpenAIProvider implements LLMProvider {
325
497
  private limits;
326
498
  constructor(config: ProviderConfig);
327
499
  completeJson<T>(params: {
328
- input: MultimodalInput;
500
+ input?: MultimodalInput;
501
+ prompt?: MultimodalInput | string;
329
502
  schema?: UnifiedSchema<T>;
330
503
  mode?: JsonMode;
331
504
  max_tokens?: number;
332
505
  reasoning?: ReasoningConfig;
333
506
  embedSchemaInPrompt?: boolean;
507
+ derivedOptions?: LLMDerivedOptions;
334
508
  }): Promise<LLMResponse<T>>;
335
509
  private buildReasoningConfig;
336
510
  private buildMessages;
@@ -349,12 +523,14 @@ declare class AnthropicProvider implements LLMProvider {
349
523
  private limits;
350
524
  constructor(config: ProviderConfig);
351
525
  completeJson<T>(params: {
352
- input: MultimodalInput;
526
+ input?: MultimodalInput;
527
+ prompt?: MultimodalInput | string;
353
528
  schema?: UnifiedSchema<T>;
354
529
  mode?: JsonMode;
355
530
  max_tokens?: number;
356
531
  reasoning?: ReasoningConfig;
357
532
  embedSchemaInPrompt?: boolean;
533
+ derivedOptions?: LLMDerivedOptions;
358
534
  }): Promise<LLMResponse<T>>;
359
535
  private buildNativeThinkingConfig;
360
536
  private translateToOpenRouterFormat;
@@ -394,6 +570,7 @@ declare class GoogleProvider implements LLMProvider {
394
570
  max_tokens?: number;
395
571
  reasoning?: ReasoningConfig;
396
572
  embedSchemaInPrompt?: boolean;
573
+ derivedOptions?: LLMDerivedOptions;
397
574
  }): Promise<LLMResponse<T>>;
398
575
  private buildNativeThinkingConfig;
399
576
  private translateToOpenRouterFormat;
@@ -415,12 +592,14 @@ declare class XAIProvider implements LLMProvider {
415
592
  private limits;
416
593
  constructor(config: ProviderConfig);
417
594
  completeJson<T>(params: {
418
- input: MultimodalInput;
595
+ input?: MultimodalInput;
596
+ prompt?: MultimodalInput | string;
419
597
  schema?: UnifiedSchema<T>;
420
598
  mode?: JsonMode;
421
599
  max_tokens?: number;
422
600
  reasoning?: ReasoningConfig;
423
601
  embedSchemaInPrompt?: boolean;
602
+ derivedOptions?: LLMDerivedOptions;
424
603
  }): Promise<LLMResponse<T>>;
425
604
  private buildReasoningConfig;
426
605
  private buildMessages;
@@ -456,6 +635,98 @@ declare class FallbackManager {
456
635
  */
457
636
  declare function adaptToCoreLLMProvider(provider: LLMProvider): LLMJsonProvider;
458
637
 
638
+ /**
639
+ * Schema for Gemini bounding box detection
640
+ * Used for OCR-style parsing with spatial information
641
+ *
642
+ * Note: Gemini uses [y_min, x_min, y_max, x_max] coordinate order (Y first, not X!)
643
+ * Coordinates are normalized to 0-1000 (divide by 1000, then multiply by the image dimensions to get pixel coordinates)
644
+ */
645
+
646
+ /**
647
+ * Block types for document structure classification
648
+ */
649
+ declare const BLOCK_TYPES: readonly ["title", "paragraph", "table", "list", "header", "footer", "caption", "code", "image", "form", "signature", "handwriting"];
650
+ type BlockType = typeof BLOCK_TYPES[number];
651
+ /**
652
+ * Single text block with bounding box
653
+ */
654
+ interface GeminiBoundingBoxBlock {
655
+ /**
656
+ * Bounding box coordinates: [y_min, x_min, y_max, x_max]
657
+ * Normalized to 0-1000 (Gemini format)
658
+ */
659
+ box_2d: [number, number, number, number];
660
+ /**
661
+ * Text content within the bounding box
662
+ */
663
+ text: string;
664
+ /**
665
+ * Block type classification
666
+ */
667
+ type: BlockType;
668
+ /**
669
+ * Confidence level (optional)
670
+ */
671
+ confidence?: 'high' | 'medium' | 'low';
672
+ /**
673
+ * Page number (0-indexed, for multi-page documents)
674
+ */
675
+ page?: number;
676
+ }
677
+ /**
678
+ * JSON Schema for Gemini bounding box extraction
679
+ * This schema is used with Gemini models to extract text with spatial information
680
+ */
681
+ declare const geminiBoundingBoxSchema: UnifiedSchema<GeminiBoundingBoxBlock[]>;
682
+ /**
683
+ * Prompt for Gemini bounding box extraction
684
+ * This activates Gemini's spatial understanding capabilities
685
+ */
686
+ declare const GEMINI_BBOX_EXTRACTION_PROMPT = "Analyze this document and extract all text with precise bounding box locations.\n\nFor each text block, provide:\n- box_2d: Bounding box as [y_min, x_min, y_max, x_max] normalized to 0-1000\n- text: The exact text content\n- type: Block classification (title, paragraph, table, list, header, footer, caption, code, image, form, signature, handwriting)\n- confidence: Your confidence level (high, medium, low)\n- page: Page number (0-indexed) for multi-page documents\n\nIMPORTANT coordinate format:\n- Use [y_min, x_min, y_max, x_max] order (Y coordinate first, then X)\n- Normalize all values to 0-1000 range (top-left is [0, 0], bottom-right is [1000, 1000])\n\nReturn ONLY a valid JSON array, no other text.";
687
+ /**
688
+ * Normalized bounding box format (0-1 range)
689
+ * This is the SDK's standard format after conversion from Gemini's 0-1000 format
690
+ */
691
+ interface NormalizedBBox {
692
+ x: number;
693
+ y: number;
694
+ width: number;
695
+ height: number;
696
+ }
697
+ /**
698
+ * Convert Gemini 0-1000 coordinates to normalized 0-1 format
699
+ * Note: Gemini uses [y_min, x_min, y_max, x_max] order
700
+ *
701
+ * @param geminiBBox - Bounding box from Gemini [y_min, x_min, y_max, x_max] (0-1000)
702
+ * @returns Normalized bounding box with x, y, width, height (0-1)
703
+ */
704
+ declare function normalizeGeminiBBox(geminiBBox: [number, number, number, number]): NormalizedBBox;
705
+ /**
706
+ * Convert normalized 0-1 format back to Gemini 0-1000 coordinates
707
+ *
708
+ * @param bbox - Normalized bounding box (0-1)
709
+ * @returns Gemini format [y_min, x_min, y_max, x_max] (0-1000)
710
+ */
711
+ declare function toGeminiBBox(bbox: NormalizedBBox): [number, number, number, number];
712
+ /**
713
+ * DocumentIR-compatible block format that a Gemini bounding box block is converted to
714
+ */
715
+ interface DocumentBlock {
716
+ text: string;
717
+ bbox: NormalizedBBox;
718
+ type: BlockType;
719
+ confidence?: number;
720
+ page?: number;
721
+ }
722
+ /**
723
+ * Convert Gemini extraction result to DocumentIR blocks
724
+ *
725
+ * @param geminiBlocks - Raw blocks from Gemini extraction
726
+ * @returns Document blocks with normalized coordinates
727
+ */
728
+ declare function convertGeminiBlocksToDocumentBlocks(geminiBlocks: GeminiBoundingBoxBlock[]): DocumentBlock[];
729
+
459
730
  /**
460
731
  * LLM Provider Metadata
461
732
  *
@@ -1417,4 +1688,4 @@ declare function createVLMProvider(config: {
1417
1688
  */
1418
1689
  declare function buildLLMProvider(config: FallbackConfig): VLMProvider;
1419
1690
 
1420
- export { type AccessMethod, AnthropicProvider, type CircuitBreakerState, type FallbackConfig, FallbackManager, GoogleProvider, type ImageInput, type JsonMode, type LLMModelMetadata, type LLMProvider, type LLMProviderMetadata, type LLMProviderType, type LLMResponse, type MultimodalInput, type NodeType, OpenAIProvider, type PDFInput, PROVIDER_METADATA, type ProviderCapabilities, type ProviderConfig, type ProviderFactory, type ProviderInputType, type ProviderType, type ReasoningConfig, type ReasoningDetail, type ResourceLimits, type ResponseMetrics, SUPPORTED_IMAGE_TYPES, SchemaTranslator, type SupportedImageMimeType, type UnifiedSchema, XAIProvider, adaptToCoreLLMProvider, buildLLMProvider, buildSchemaPromptSection, combineSchemaAndUserPrompt, compareNativeVsOpenRouter, createProviderFromRegistry, createVLMProvider, estimateCost, formatSchemaForPrompt, getCheapestProvider, getProvidersForNode, isImageTypeSupported, isProviderCompatibleWithNode, providerRegistry, registerProvider, supportsPDFsInline };
1691
+ export { type AccessMethod, AnthropicProvider, BLOCK_TYPES, type BlockType, type CircuitBreakerState, type DocumentBlock, type FallbackConfig, FallbackManager, GEMINI_BBOX_EXTRACTION_PROMPT, type GeminiBoundingBoxBlock, GoogleProvider, type ImageInput, type JsonMode, type LLMDerivedOptions, type LLMExtractedMetadata, type LLMModelMetadata, type LLMProvider, type LLMProviderMetadata, type LLMProviderType, type LLMResponse, type MultimodalInput, type NodeType, type NormalizedBBox, OpenAIProvider, type PDFInput, PROVIDER_METADATA, type ProviderCapabilities, type ProviderConfig, type ProviderFactory, type ProviderInputType, type ProviderType, type ReasoningConfig, type ReasoningDetail, type ResourceLimits, type ResponseMetrics, SUPPORTED_IMAGE_TYPES, SchemaTranslator, type SupportedImageMimeType, type UnifiedSchema, XAIProvider, adaptToCoreLLMProvider, buildBlockClassificationPrompt, buildConfidencePrompt, buildLLMDerivedFeaturesPrompt, buildLLMProvider, buildLanguageHintsPrompt, buildOutputFormatPrompt, buildSchemaPromptSection, buildSourcesPrompt, combineSchemaAndUserPrompt, combineSchemaUserAndDerivedPrompts, compareNativeVsOpenRouter, convertGeminiBlocksToDocumentBlocks, createProviderFromRegistry, createVLMProvider, estimateCost, extractMetadataFromResponse, formatSchemaForPrompt, geminiBoundingBoxSchema, getCheapestProvider, getProvidersForNode, isImageTypeSupported, isProviderCompatibleWithNode, normalizeGeminiBBox, providerRegistry, registerProvider, shouldExtractMetadata, supportsPDFsInline, toGeminiBBox };