@doclo/core 0.1.12 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -511,6 +511,22 @@ type OutputFormatSupport = {
511
511
  html: boolean;
512
512
  json: boolean;
513
513
  };
/**
 * Support level of a single normalized feature.
 *
 * - `true` - natively supported by the provider API
 * - `false` - not supported
 * - `'deprecated'` - deprecated by the provider API; may not work
 * - `'derived'` - supplied by the SDK through a transformation
 *   (e.g. maxPages computed from pageRange)
 */
type FeatureStatus = boolean | 'deprecated' | 'derived';
/**
 * Tell whether a feature status counts as enabled.
 *
 * @param status - Feature status to test
 * @returns `true` for `true`, `'deprecated'` and `'derived'`; `false` for `false`
 */
declare function isFeatureEnabled(status: FeatureStatus): boolean;
/**
 * Page-numbering convention a provider uses:
 * `'0-indexed'` or `'1-indexed'`.
 */
type PageIndexing = '1-indexed' | '0-indexed';
514
530
  /**
515
531
  * Normalized features across all providers.
516
532
  * Maps provider-specific option names to unified names.
@@ -520,47 +536,77 @@ type OutputFormatSupport = {
520
536
  */
521
537
  type NormalizedFeatures = {
522
538
  /** Limit to first N pages */
523
- maxPages: boolean;
539
+ maxPages: FeatureStatus;
524
540
  /** Specific page range selection */
525
- pageRange: boolean;
541
+ pageRange: FeatureStatus;
526
542
  /** OCR language hints (maps from 'langs') */
527
- languageHints: boolean;
543
+ languageHints: FeatureStatus;
528
544
  /** Quality/speed modes (fast/balanced/high_accuracy) */
529
- processingModes: boolean;
545
+ processingModes: FeatureStatus;
530
546
  /** Reducto agentic mode (higher accuracy, more cost) */
531
- agenticMode: boolean;
547
+ agenticMode: FeatureStatus;
532
548
  /** Custom prompts (maps from blockCorrectionPrompt, additionalPrompt, systemPrompt) */
533
- customPrompts: boolean;
549
+ customPrompts: FeatureStatus;
534
550
  /** Extract embedded images (maps from extractImages, returnImages) */
535
- imageExtraction: boolean;
551
+ imageExtraction: FeatureStatus;
536
552
  /** Page delimiters (maps from paginate, addPageMarkers) */
537
- pageMarkers: boolean;
538
- /** Field-level citations with source references */
539
- citations: boolean;
553
+ pageMarkers: FeatureStatus;
554
+ /** Field-level citations with source references (page/char/block indices) */
555
+ citations: FeatureStatus;
540
556
  /** Document chunking modes (RAG-optimized) */
541
- chunking: boolean;
557
+ chunking: FeatureStatus;
542
558
  /** Auto-segmentation for multi-document PDFs */
543
- segmentation: boolean;
559
+ segmentation: FeatureStatus;
544
560
  /** Re-run OCR on already-OCR'd documents */
545
- stripExistingOCR: boolean;
561
+ stripExistingOCR: FeatureStatus;
546
562
  /** Format lines in output */
547
- formatLines: boolean;
563
+ formatLines: FeatureStatus;
548
564
  /** Force OCR even if text layer exists */
549
- forceOCR: boolean;
565
+ forceOCR: FeatureStatus;
550
566
  /** Table format options (html/json/md/csv) */
551
- tableOutputFormats: boolean;
567
+ tableOutputFormats: FeatureStatus;
552
568
  /** Merge consecutive tables */
553
- tableMerging: boolean;
569
+ tableMerging: FeatureStatus;
554
570
  /** Block-level confidence scores */
555
- confidence: boolean;
556
- /** Bounding box coordinates for text/elements */
557
- boundingBoxes: boolean;
571
+ confidence: FeatureStatus;
572
+ /** Bounding box coordinates for TEXT elements (pixel/normalized coords) */
573
+ boundingBoxes: FeatureStatus;
574
+ /** Bounding box coordinates for IMAGES/FIGURES only (not text) */
575
+ imageBoundingBoxes: FeatureStatus;
558
576
  /** JSON schema validation for structured output */
559
- schemaValidation: boolean;
577
+ schemaValidation: FeatureStatus;
560
578
  /** Handwritten text recognition support */
561
- handwrittenText: boolean;
579
+ handwrittenText: FeatureStatus;
562
580
  /** Separate header/footer extraction from main content */
563
- headerFooterExtraction: boolean;
581
+ headerFooterExtraction: FeatureStatus;
582
+ /** Optimize output for embeddings/RAG */
583
+ embedOptimized: FeatureStatus;
584
+ /** Handle encrypted/password-protected PDFs */
585
+ passwordProtected: FeatureStatus;
586
+ /** Filter block types (headers, footers, page numbers, etc.) */
587
+ contentFiltering: FeatureStatus;
588
+ /** OCR system/mode selection (standard/legacy, auto/full) */
589
+ ocrMode: FeatureStatus;
590
+ /** Async completion webhook callbacks */
591
+ webhookCallback: FeatureStatus;
592
+ /** Vision quality control (low/medium/high) - Gemini */
593
+ mediaResolution: FeatureStatus;
594
+ /** Track changes extraction from Word docs */
595
+ changeTracking: FeatureStatus;
596
+ /** Extract hyperlinks from documents */
597
+ hyperlinkExtraction: FeatureStatus;
598
+ /** Enhanced chart and graph interpretation (Datalab extras=chart_understanding) */
599
+ chartUnderstanding: FeatureStatus;
600
+ /** Control image caption generation (Datalab disable_image_captions) */
601
+ imageCaptions: FeatureStatus;
602
+ /** Extract signatures from documents (Reducto include: ["signatures"]) */
603
+ signatureExtraction: FeatureStatus;
604
+ /** Extract comments/annotations from documents (Reducto include: ["comments"]) */
605
+ commentExtraction: FeatureStatus;
606
+ /** Extract highlighted text from documents (Reducto include: ["highlight"]) */
607
+ highlightExtraction: FeatureStatus;
608
+ /** Summarize figures/charts with VLM (Reducto summarize_figures) */
609
+ figureSummaries: FeatureStatus;
564
610
  /** Supported output formats */
565
611
  outputFormats: OutputFormatSupport;
566
612
  };
@@ -929,6 +975,64 @@ declare function getAllModels(): ResolvedModelMetadata[];
929
975
  * Clear model registry (useful for testing)
930
976
  */
931
977
  declare function clearModelRegistry(): void;
/**
 * Look up which page-indexing convention a provider uses.
 *
 * @param provider - Normalized provider metadata, or a provider source string
 * @returns The convention for that provider: `'0-indexed'` or `'1-indexed'`
 */
declare function getPageIndexing(provider: NormalizedProviderMetadata | string): PageIndexing;
/**
 * User-facing options that derived-feature transformation consumes.
 */
type DerivedFeatureOptions = {
    /** Limit processing to the first N pages. */
    maxPages?: number;
    /** Explicit page range selection. */
    pageRange?: string;
};
/**
 * Outcome of a derived-feature transformation.
 */
type TransformedOptions = {
    /** Page range in the provider-specific string format */
    page_range?: string;
    /** Page list as an array, for providers that accept one (e.g., Mistral) */
    pages?: number[];
    /** The caller's options with the derived ones removed */
    remainingOptions: Record<string, unknown>;
};
1003
+ /**
1004
+ * Transform maxPages to provider-specific pageRange format.
1005
+ *
1006
+ * This utility handles the conversion when a provider has `maxPages: 'derived'`,
1007
+ * meaning the SDK provides maxPages functionality via the underlying pageRange API.
1008
+ *
1009
+ * @param options - User-provided options including maxPages
1010
+ * @param provider - Provider metadata to determine format
1011
+ * @returns Transformed options with provider-specific pageRange
1012
+ *
1013
+ * @example
1014
+ * ```typescript
1015
+ * // User wants first 5 pages from Reducto (1-indexed)
1016
+ * const result = transformDerivedFeatures({ maxPages: 5 }, reductoProvider);
1017
+ * // => { page_range: '1-5', remainingOptions: {} }
1018
+ *
1019
+ * // User wants first 5 pages from Datalab (0-indexed)
1020
+ * const result = transformDerivedFeatures({ maxPages: 5 }, datalabProvider);
1021
+ * // => { page_range: '0-4', remainingOptions: {} }
1022
+ *
1023
+ * // User wants first 5 pages from Mistral (0-indexed, array format)
1024
+ * const result = transformDerivedFeatures({ maxPages: 5 }, mistralProvider);
1025
+ * // => { page_range: '0-4', pages: [0,1,2,3,4], remainingOptions: {} }
1026
+ * ```
1027
+ */
1028
+ declare function transformDerivedFeatures(options: DerivedFeatureOptions & Record<string, unknown>, provider: NormalizedProviderMetadata): TransformedOptions;
1029
+ /**
1030
+ * Check if a provider requires derived feature transformation for maxPages.
1031
+ *
1032
+ * @param provider - Provider metadata
1033
+ * @returns true if maxPages needs to be transformed to pageRange
1034
+ */
1035
+ declare function requiresMaxPagesTransformation(provider: NormalizedProviderMetadata): boolean;
932
1036
 
933
1037
  /**
934
1038
  * @doclo/core - Retry Utilities
@@ -1090,4 +1194,4 @@ declare function getCircuitBreaker(key: string): CircuitBreaker | undefined;
1090
1194
  */
1091
1195
  declare function withRetry<T>(fn: () => Promise<T>, options?: WithRetryOptions<T>): Promise<T>;
1092
1196
 
1093
- export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, resolveDocument, resolveModelMetadata, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };
1197
+ export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DerivedFeatureOptions, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, type FeatureStatus, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type PageIndexing, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type TransformedOptions, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getPageIndexing, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isFeatureEnabled, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, requiresMaxPagesTransformation, resolveDocument, resolveModelMetadata, transformDerivedFeatures, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };
package/dist/index.js CHANGED
@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
1398
1398
  }
1399
1399
 
1400
1400
  // src/provider-query.ts
/**
 * Report whether a normalized feature status counts as usable.
 *
 * `true`, `"deprecated"` and `"derived"` count as enabled; every other value
 * (including `false` and `undefined`) does not.
 */
function isFeatureEnabled(status) {
  switch (status) {
    case true:
    case "deprecated":
    case "derived":
      return true;
    default:
      return false;
  }
}
1401
1404
  var providerRegistry = /* @__PURE__ */ new Map();
1402
1405
  function registerProviderMetadata(source, metadata, normalizer) {
1403
1406
  const normalized = /* @__PURE__ */ new Map();
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
1482
1485
  }
1483
1486
  if (filter.hasFeatures && filter.hasFeatures.length > 0) {
1484
1487
  providers = providers.filter(
1485
- (p) => filter.hasFeatures.every((feature) => p.features[feature] === true)
1488
+ (p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
1486
1489
  );
1487
1490
  }
1488
1491
  if (filter.outputFormat) {
@@ -1581,9 +1584,25 @@ function defaultNormalizer(id, data, source) {
1581
1584
  tableMerging: false,
1582
1585
  confidence: false,
1583
1586
  boundingBoxes: false,
1587
+ imageBoundingBoxes: false,
1584
1588
  schemaValidation: false,
1585
1589
  handwrittenText: false,
1586
1590
  headerFooterExtraction: false,
1591
+ // Extended features
1592
+ embedOptimized: false,
1593
+ passwordProtected: false,
1594
+ contentFiltering: false,
1595
+ ocrMode: false,
1596
+ webhookCallback: false,
1597
+ mediaResolution: false,
1598
+ changeTracking: false,
1599
+ hyperlinkExtraction: false,
1600
+ chartUnderstanding: false,
1601
+ imageCaptions: false,
1602
+ signatureExtraction: false,
1603
+ commentExtraction: false,
1604
+ highlightExtraction: false,
1605
+ figureSummaries: false,
1587
1606
  outputFormats: defaultOutputFormats
1588
1607
  };
1589
1608
  return {
@@ -1638,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
1638
1657
  html: true,
1639
1658
  json: d.capabilities?.supportsStructuredOutput ?? true
1640
1659
  };
1660
+ const vendor = d.vendor ?? id;
1641
1661
  const features = {
1642
- maxPages: d.inputFormats?.pdfs?.maxPages !== void 0,
1643
- pageRange: true,
1644
- // LLMs can handle page ranges
1662
+ maxPages: "derived",
1663
+ // SDK can limit via pre-processing
1664
+ pageRange: false,
1665
+ // No native API support - LLMs receive full text
1645
1666
  languageHints: false,
1646
1667
  // Not applicable to LLMs
1647
1668
  processingModes: false,
@@ -1654,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
1654
1675
  // LLMs don't extract images
1655
1676
  pageMarkers: false,
1656
1677
  // LLMs don't add page markers
1657
- citations: false,
1658
- // Most LLMs don't have native citations (Anthropic has different API)
1678
+ citations: vendor === "anthropic" ? true : false,
1679
+ // Anthropic has Citations API
1659
1680
  chunking: false,
1660
1681
  // LLMs don't do chunking
1661
1682
  segmentation: false,
@@ -1669,15 +1690,32 @@ function normalizeLLMProvider(id, d) {
1669
1690
  // LLMs don't provide confidence scores
1670
1691
  boundingBoxes: false,
1671
1692
  // LLMs don't provide bounding boxes
1693
+ imageBoundingBoxes: false,
1694
+ // LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
1672
1695
  schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
1673
1696
  // Some LLMs support schema validation
1674
1697
  handwrittenText: false,
1675
1698
  // Not specific to LLMs
1676
1699
  headerFooterExtraction: false,
1677
1700
  // LLMs don't extract header/footer separately
1701
+ // Extended features
1702
+ embedOptimized: false,
1703
+ passwordProtected: false,
1704
+ contentFiltering: false,
1705
+ ocrMode: false,
1706
+ webhookCallback: false,
1707
+ mediaResolution: vendor === "google" ? true : false,
1708
+ // Google Gemini has mediaResolution
1709
+ changeTracking: false,
1710
+ hyperlinkExtraction: false,
1711
+ chartUnderstanding: false,
1712
+ imageCaptions: false,
1713
+ signatureExtraction: false,
1714
+ commentExtraction: false,
1715
+ highlightExtraction: false,
1716
+ figureSummaries: false,
1678
1717
  outputFormats
1679
1718
  };
1680
- const vendor = d.vendor ?? id;
1681
1719
  return {
1682
1720
  id,
1683
1721
  name: d.name ?? id,
@@ -1698,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
1698
1736
  supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
1699
1737
  // NEW capabilities
1700
1738
  supportsPrompts: true,
1701
- supportsCitations: false,
1739
+ supportsCitations: vendor === "anthropic",
1740
+ // Anthropic has Citations API
1702
1741
  supportsChunking: false,
1703
1742
  supportsImageExtraction: false,
1704
1743
  supportsPageMarkers: false,
@@ -1745,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
1745
1784
  function normalizeDatalabProvider(id, d) {
1746
1785
  const opts = d.supportedOptions ?? {};
1747
1786
  const isVLM = d.type === "VLM";
1787
+ const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
1788
+ const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
1748
1789
  const model = d.model ?? id;
1749
1790
  const outputFormats = {
1750
1791
  text: true,
@@ -1755,35 +1796,61 @@ function normalizeDatalabProvider(id, d) {
1755
1796
  const features = {
1756
1797
  maxPages: opts.maxPages ?? false,
1757
1798
  pageRange: opts.pageRange ?? false,
1758
- languageHints: opts.langs ?? false,
1759
- // maps from 'langs'
1799
+ languageHints: opts.langs ? "deprecated" : false,
1800
+ // API ignores, handled automatically
1760
1801
  processingModes: opts.mode ?? false,
1761
1802
  agenticMode: false,
1762
1803
  // Datalab doesn't have agentic mode
1763
- customPrompts: opts.blockCorrectionPrompt ?? false,
1804
+ customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
1805
+ // Not currently supported
1764
1806
  imageExtraction: opts.extractImages ?? false,
1765
1807
  pageMarkers: opts.paginate ?? false,
1766
1808
  // maps from 'paginate'
1767
- citations: opts.citations ?? false,
1809
+ citations: isMarkerVLM ? true : false,
1810
+ // Marker VLM has citations
1768
1811
  chunking: false,
1769
1812
  // Datalab doesn't have chunking
1770
1813
  segmentation: opts.segmentation ?? false,
1771
- stripExistingOCR: opts.stripExistingOCR ?? false,
1772
- formatLines: opts.formatLines ?? false,
1773
- forceOCR: true,
1774
- // Datalab supports force_ocr
1814
+ stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
1815
+ // Managed automatically
1816
+ formatLines: opts.formatLines ? "deprecated" : false,
1817
+ // Handled automatically
1818
+ forceOCR: "deprecated",
1819
+ // DEPRECATED: force_ocr param has no effect per API docs
1775
1820
  tableOutputFormats: false,
1776
1821
  tableMerging: false,
1777
1822
  confidence: false,
1778
1823
  // Datalab doesn't provide confidence scores
1779
1824
  boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
1780
- // Datalab provides bounding boxes
1825
+ // Datalab Surya provides text bboxes
1826
+ imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
1827
+ // Marker extracts images with bboxes
1781
1828
  schemaValidation: isVLM,
1782
1829
  // VLM providers support schema validation
1783
1830
  handwrittenText: true,
1784
1831
  // Datalab handles handwritten text
1785
1832
  headerFooterExtraction: false,
1786
1833
  // Datalab has issues with header/footer extraction
1834
+ // Extended features
1835
+ embedOptimized: false,
1836
+ passwordProtected: false,
1837
+ contentFiltering: false,
1838
+ ocrMode: false,
1839
+ webhookCallback: true,
1840
+ // Datalab supports webhook callbacks
1841
+ mediaResolution: false,
1842
+ changeTracking: true,
1843
+ // Datalab marker_extras supports track_changes
1844
+ hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
1845
+ // Datalab extras=extract_links
1846
+ chartUnderstanding: isMarkerOCR || isMarkerVLM,
1847
+ // Datalab extras=chart_understanding
1848
+ imageCaptions: isMarkerOCR || isMarkerVLM,
1849
+ // Datalab disable_image_captions param
1850
+ signatureExtraction: false,
1851
+ commentExtraction: false,
1852
+ highlightExtraction: false,
1853
+ figureSummaries: false,
1787
1854
  outputFormats
1788
1855
  };
1789
1856
  return {
@@ -1852,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
1852
1919
  const opts = d.supportedOptions ?? {};
1853
1920
  const isVLM = d.type === "VLM";
1854
1921
  const isExtract = d.compatibleNodes?.extract === true;
1922
+ const isParse = d.compatibleNodes?.parse === true;
1855
1923
  const model = d.model ?? "v1";
1856
1924
  const outputFormats = {
1857
1925
  text: d.outputFormat?.features?.textLines ?? true,
@@ -1861,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
1861
1929
  json: d.outputFormat?.features?.structuredJSON ?? isExtract
1862
1930
  };
1863
1931
  const features = {
1864
- maxPages: opts.maxPages ?? false,
1932
+ maxPages: opts.pageRange ?? false ? "derived" : false,
1933
+ // SDK derives from pageRange (1-indexed)
1865
1934
  pageRange: opts.pageRange ?? false,
1866
- languageHints: opts.langs ?? false,
1867
- // Reducto doesn't support langs
1935
+ languageHints: false,
1936
+ // Reducto doesn't support language hints
1868
1937
  processingModes: false,
1869
1938
  // Reducto uses agentic instead
1870
1939
  agenticMode: opts.mode ?? false,
@@ -1887,14 +1956,44 @@ function normalizeReductoProvider(id, d) {
1887
1956
  // Parse has mergeTables
1888
1957
  confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
1889
1958
  // Reducto Parse has confidence
1890
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? d.compatibleNodes?.parse ?? false,
1891
- // Reducto Parse has bounding boxes
1959
+ boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
1960
+ // Reducto Parse has text bounding boxes
1961
+ imageBoundingBoxes: isParse ? true : false,
1962
+ // Reducto Parse has figure bounding boxes
1892
1963
  schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
1893
1964
  // Extract has schema validation
1894
1965
  handwrittenText: false,
1895
1966
  // Reducto doesn't specifically advertise handwriting
1896
1967
  headerFooterExtraction: true,
1897
1968
  // Reducto has Header/Footer block types
1969
+ // Extended features
1970
+ embedOptimized: isParse,
1971
+ // Reducto Parse supports retrieval.embedding_optimized: true
1972
+ passwordProtected: true,
1973
+ // Reducto handles encrypted PDFs
1974
+ contentFiltering: true,
1975
+ // Reducto can filter block types
1976
+ ocrMode: opts.ocrSystem ?? false,
1977
+ // Reducto has ocr_system selection
1978
+ webhookCallback: true,
1979
+ // Reducto supports webhook callbacks
1980
+ mediaResolution: false,
1981
+ changeTracking: true,
1982
+ // Reducto tracks changes in Word docs
1983
+ hyperlinkExtraction: true,
1984
+ // Reducto extracts hyperlinks via formatting.include
1985
+ chartUnderstanding: isParse,
1986
+ // Reducto enhance.agentic[].advanced_chart_agent for figures
1987
+ imageCaptions: false,
1988
+ // Not available in Reducto
1989
+ signatureExtraction: false,
1990
+ // NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
1991
+ commentExtraction: isParse || isExtract,
1992
+ // Reducto formatting.include: ["comments"]
1993
+ highlightExtraction: isParse || isExtract,
1994
+ // Reducto formatting.include: ["highlight"]
1995
+ figureSummaries: isParse,
1996
+ // Reducto enhance.summarize_figures
1898
1997
  outputFormats
1899
1998
  };
1900
1999
  return {
@@ -1980,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
1980
2079
  // Unsiloed doesn't have page range option
1981
2080
  languageHints: false,
1982
2081
  // Unsiloed doesn't support language hints
1983
- processingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
2082
+ processingModes: false,
2083
+ // Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
1984
2084
  agenticMode: false,
1985
2085
  // Unsiloed doesn't have agentic mode
1986
2086
  customPrompts: false,
@@ -2002,14 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
2002
2102
  tableMerging: false,
2003
2103
  confidence: d.outputFormat?.features?.confidence ?? false,
2004
2104
  // Unsiloed may provide confidence
2005
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
2006
- // Unsiloed may provide bounding boxes
2105
+ boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
2106
+ // Unsiloed Parse has bounding boxes
2107
+ imageBoundingBoxes: false,
2108
+ // Unsiloed doesn't return image-specific bboxes
2007
2109
  schemaValidation: isExtract,
2008
2110
  // Extract supports schema validation
2009
- handwrittenText: false,
2010
- // Unsiloed doesn't specifically advertise handwriting
2111
+ handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
2112
+ // Parse supports handwriting
2011
2113
  headerFooterExtraction: false,
2012
2114
  // Unsiloed doesn't extract header/footer separately
2115
+ // Extended features
2116
+ embedOptimized: false,
2117
+ passwordProtected: false,
2118
+ contentFiltering: isParse,
2119
+ // Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
2120
+ ocrMode: isParse,
2121
+ // Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
2122
+ webhookCallback: false,
2123
+ // Unsiloed is synchronous
2124
+ mediaResolution: false,
2125
+ changeTracking: false,
2126
+ hyperlinkExtraction: false,
2127
+ chartUnderstanding: false,
2128
+ // Not available in Unsiloed
2129
+ imageCaptions: false,
2130
+ // Not available in Unsiloed
2131
+ signatureExtraction: false,
2132
+ // Not available in Unsiloed
2133
+ commentExtraction: false,
2134
+ // Not available in Unsiloed
2135
+ highlightExtraction: false,
2136
+ // Not available in Unsiloed
2137
+ figureSummaries: false,
2138
+ // Not available in Unsiloed
2013
2139
  outputFormats
2014
2140
  };
2015
2141
  return {
@@ -2038,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
2038
2164
  supportsImageExtraction: false,
2039
2165
  supportsPageMarkers: false,
2040
2166
  supportsLanguageHints: false,
2041
- supportsProcessingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
2167
+ supportsProcessingModes: false,
2168
+ // Unsiloed doesn't have fast/balanced/high_accuracy modes
2042
2169
  supportsSegmentation: isSplit || isCategorize,
2043
2170
  outputFormats
2044
2171
  },
@@ -2256,7 +2383,7 @@ function matchesModelFilter(model, filter) {
2256
2383
  }
2257
2384
  if (filter.hasFeatures && filter.hasFeatures.length > 0) {
2258
2385
  for (const feature of filter.hasFeatures) {
2259
- if (model.features[feature] !== true) {
2386
+ if (!isFeatureEnabled(model.features[feature])) {
2260
2387
  return false;
2261
2388
  }
2262
2389
  }
@@ -2299,6 +2426,44 @@ function getAllModels() {
2299
2426
  function clearModelRegistry() {
2300
2427
  modelRegistry.clear();
2301
2428
  }
/**
 * Page-numbering convention expected by each provider source.
 * Sources that are not listed fall back to "1-indexed" in getPageIndexing.
 */
var PAGE_INDEXING = {
  datalab: "0-indexed",
  reducto: "1-indexed",
  mistral: "0-indexed",
  unsiloed: "1-indexed", // Default assumption
  llm: "1-indexed" // N/A but default
};

/**
 * Resolve the page-indexing convention for a provider.
 *
 * @param provider - Provider metadata object (its `source` is read) or a source string.
 * @returns "0-indexed" or "1-indexed"; unknown sources yield "1-indexed".
 */
function getPageIndexing(provider) {
  const source = typeof provider === "string" ? provider : provider.source;
  // Own-property check: a bare `PAGE_INDEXING[source]` would hand back inherited
  // Object.prototype members for sources like "constructor" or "toString".
  if (typeof source === "string" && Object.prototype.hasOwnProperty.call(PAGE_INDEXING, source)) {
    return PAGE_INDEXING[source];
  }
  return "1-indexed";
}
2442
+ function transformDerivedFeatures(options, provider) {
2443
+ const { maxPages, pageRange, ...remainingOptions } = options;
2444
+ const result = { remainingOptions };
2445
+ if (pageRange !== void 0) {
2446
+ result.page_range = pageRange;
2447
+ return result;
2448
+ }
2449
+ if (maxPages !== void 0 && provider.features.maxPages === "derived") {
2450
+ const indexing = getPageIndexing(provider);
2451
+ if (indexing === "0-indexed") {
2452
+ result.page_range = `0-${maxPages - 1}`;
2453
+ if (provider.source === "mistral") {
2454
+ result.pages = Array.from({ length: maxPages }, (_, i) => i);
2455
+ }
2456
+ } else {
2457
+ result.page_range = `1-${maxPages}`;
2458
+ }
2459
+ } else if (maxPages !== void 0 && isFeatureEnabled(provider.features.maxPages)) {
2460
+ result.remainingOptions.maxPages = maxPages;
2461
+ }
2462
+ return result;
2463
+ }
/**
 * Tell whether the provider's maxPages support is SDK-derived.
 *
 * @param provider - Normalized provider metadata; `features.maxPages` is read.
 * @returns `true` only when `features.maxPages` is exactly `"derived"`.
 */
function requiresMaxPagesTransformation(provider) {
  const { maxPages: maxPagesStatus } = provider.features;
  return maxPagesStatus === "derived";
}
2302
2467
  function normalizeMistralProvider(id, d) {
2303
2468
  const opts = d.supportedOptions ?? {};
2304
2469
  const isVLM = d.type === "VLM";
@@ -2314,7 +2479,7 @@ function normalizeMistralProvider(id, d) {
2314
2479
  const features = {
2315
2480
  maxPages: d.inputFormats?.maxPages !== void 0,
2316
2481
  pageRange: true,
2317
- // Mistral supports pages param: "0-5" or [0,2,5]
2482
+ // Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
2318
2483
  languageHints: false,
2319
2484
  // Mistral doesn't support language hints
2320
2485
  processingModes: false,
@@ -2342,14 +2507,39 @@ function normalizeMistralProvider(id, d) {
2342
2507
  tableMerging: false,
2343
2508
  confidence: false,
2344
2509
  // Mistral doesn't provide confidence scores
2345
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
2346
- // NO text-level bboxes
2510
+ boundingBoxes: false,
2511
+ // Mistral does NOT provide text-level bounding boxes
2512
+ imageBoundingBoxes: true,
2513
+ // Mistral provides image/figure bounding boxes only
2347
2514
  schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
2348
2515
  // VLM supports schema
2349
2516
  handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
2350
2517
  // Excellent handwriting support
2351
2518
  headerFooterExtraction: opts.extractHeader ?? opts.extractFooter ?? false,
2352
2519
  // extract_header/extract_footer
2520
+ // Extended features
2521
+ embedOptimized: false,
2522
+ passwordProtected: false,
2523
+ contentFiltering: false,
2524
+ ocrMode: false,
2525
+ webhookCallback: false,
2526
+ // Mistral is synchronous
2527
+ mediaResolution: false,
2528
+ changeTracking: false,
2529
+ hyperlinkExtraction: true,
2530
+ // Response pages[].hyperlinks[] auto-extracted
2531
+ chartUnderstanding: false,
2532
+ // Not available as separate feature in Mistral
2533
+ imageCaptions: false,
2534
+ // Not available in Mistral
2535
+ signatureExtraction: false,
2536
+ // Not available in Mistral
2537
+ commentExtraction: false,
2538
+ // Not available in Mistral
2539
+ highlightExtraction: false,
2540
+ // Not available in Mistral
2541
+ figureSummaries: false,
2542
+ // Not available in Mistral
2353
2543
  outputFormats
2354
2544
  };
2355
2545
  return {
@@ -2366,8 +2556,8 @@ function normalizeMistralProvider(id, d) {
2366
2556
  capabilities: {
2367
2557
  supportsImages: d.capabilities?.supportsImages ?? true,
2368
2558
  supportsPDFs: d.capabilities?.supportsPDFs ?? true,
2369
- supportsDocuments: d.capabilities?.supportsDocuments ?? false,
2370
- // DOCX/PPTX has known issues
2559
+ supportsDocuments: d.capabilities?.supportsDocuments ?? true,
2560
+ // Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
2371
2561
  supportsReasoning: false,
2372
2562
  // OCR 3 doesn't do reasoning
2373
2563
  supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
@@ -2635,6 +2825,7 @@ export {
2635
2825
  getNodeTypeName,
2636
2826
  getPDFPageCount,
2637
2827
  getPageCountMetadata,
2828
+ getPageIndexing,
2638
2829
  getProviderById,
2639
2830
  getProvidersBySource,
2640
2831
  getProvidersForLargeFiles,
@@ -2642,6 +2833,7 @@ export {
2642
2833
  getSuggestedConnections,
2643
2834
  getTotalPageCount,
2644
2835
  getValidForEachStarters,
2836
+ isFeatureEnabled,
2645
2837
  isLocalEndpoint,
2646
2838
  isPDFDocument,
2647
2839
  isRetryableError,
@@ -2653,11 +2845,13 @@ export {
2653
2845
  queryProviders,
2654
2846
  registerProviderMetadata,
2655
2847
  registerProviderWithModels,
2848
+ requiresMaxPagesTransformation,
2656
2849
  resolveDocument,
2657
2850
  resolveModelMetadata,
2658
2851
  runPipeline,
2659
2852
  splitPDFIntoChunks,
2660
2853
  toProviderString,
2854
+ transformDerivedFeatures,
2661
2855
  validateFlowInputFormat,
2662
2856
  validateJson,
2663
2857
  validateMimeType,