@doclo/core 0.1.11 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
1398
1398
  }
1399
1399
 
1400
1400
  // src/provider-query.ts
1401
/**
 * Reports whether a feature status counts as "available" for filtering.
 *
 * A status is treated as enabled when it is exactly `true`, or one of the
 * string markers "deprecated" or "derived". Every other value (including
 * `false`, `undefined`, and other strings) is treated as disabled.
 *
 * @param {*} status - Feature status value taken from a provider/model feature map.
 * @returns {boolean} `true` when the status is one of the enabled markers.
 */
function isFeatureEnabled(status) {
  // `switch` compares with strict equality, so no coercion happens here.
  switch (status) {
    case true:
    case "deprecated":
    case "derived":
      return true;
    default:
      return false;
  }
}
1401
1404
  var providerRegistry = /* @__PURE__ */ new Map();
1402
1405
  function registerProviderMetadata(source, metadata, normalizer) {
1403
1406
  const normalized = /* @__PURE__ */ new Map();
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
1482
1485
  }
1483
1486
  if (filter.hasFeatures && filter.hasFeatures.length > 0) {
1484
1487
  providers = providers.filter(
1485
- (p) => filter.hasFeatures.every((feature) => p.features[feature] === true)
1488
+ (p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
1486
1489
  );
1487
1490
  }
1488
1491
  if (filter.outputFormat) {
@@ -1558,6 +1561,8 @@ function defaultNormalizer(id, data, source) {
1558
1561
  return normalizeReductoProvider(id, d);
1559
1562
  } else if (source === "unsiloed") {
1560
1563
  return normalizeUnsiloedProvider(id, d);
1564
+ } else if (source === "mistral") {
1565
+ return normalizeMistralProvider(id, d);
1561
1566
  }
1562
1567
  const defaultOutputFormats = { text: true, markdown: false, html: false, json: false };
1563
1568
  const defaultFeatures = {
@@ -1579,8 +1584,25 @@ function defaultNormalizer(id, data, source) {
1579
1584
  tableMerging: false,
1580
1585
  confidence: false,
1581
1586
  boundingBoxes: false,
1587
+ imageBoundingBoxes: false,
1582
1588
  schemaValidation: false,
1583
1589
  handwrittenText: false,
1590
+ headerFooterExtraction: false,
1591
+ // Extended features
1592
+ embedOptimized: false,
1593
+ passwordProtected: false,
1594
+ contentFiltering: false,
1595
+ ocrMode: false,
1596
+ webhookCallback: false,
1597
+ mediaResolution: false,
1598
+ changeTracking: false,
1599
+ hyperlinkExtraction: false,
1600
+ chartUnderstanding: false,
1601
+ imageCaptions: false,
1602
+ signatureExtraction: false,
1603
+ commentExtraction: false,
1604
+ highlightExtraction: false,
1605
+ figureSummaries: false,
1584
1606
  outputFormats: defaultOutputFormats
1585
1607
  };
1586
1608
  return {
@@ -1635,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
1635
1657
  html: true,
1636
1658
  json: d.capabilities?.supportsStructuredOutput ?? true
1637
1659
  };
1660
+ const vendor = d.vendor ?? id;
1638
1661
  const features = {
1639
- maxPages: d.inputFormats?.pdfs?.maxPages !== void 0,
1640
- pageRange: true,
1641
- // LLMs can handle page ranges
1662
+ maxPages: "derived",
1663
+ // SDK can limit via pre-processing
1664
+ pageRange: false,
1665
+ // No native API support - LLMs receive full text
1642
1666
  languageHints: false,
1643
1667
  // Not applicable to LLMs
1644
1668
  processingModes: false,
@@ -1651,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
1651
1675
  // LLMs don't extract images
1652
1676
  pageMarkers: false,
1653
1677
  // LLMs don't add page markers
1654
- citations: false,
1655
- // Most LLMs don't have native citations (Anthropic has different API)
1678
+ citations: vendor === "anthropic" ? true : false,
1679
+ // Anthropic has Citations API
1656
1680
  chunking: false,
1657
1681
  // LLMs don't do chunking
1658
1682
  segmentation: false,
@@ -1666,13 +1690,32 @@ function normalizeLLMProvider(id, d) {
1666
1690
  // LLMs don't provide confidence scores
1667
1691
  boundingBoxes: false,
1668
1692
  // LLMs don't provide bounding boxes
1693
+ imageBoundingBoxes: false,
1694
+ // LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
1669
1695
  schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
1670
1696
  // Some LLMs support schema validation
1671
1697
  handwrittenText: false,
1672
1698
  // Not specific to LLMs
1699
+ headerFooterExtraction: false,
1700
+ // LLMs don't extract header/footer separately
1701
+ // Extended features
1702
+ embedOptimized: false,
1703
+ passwordProtected: false,
1704
+ contentFiltering: false,
1705
+ ocrMode: false,
1706
+ webhookCallback: false,
1707
+ mediaResolution: vendor === "google" ? true : false,
1708
+ // Google Gemini has mediaResolution
1709
+ changeTracking: false,
1710
+ hyperlinkExtraction: false,
1711
+ chartUnderstanding: false,
1712
+ imageCaptions: false,
1713
+ signatureExtraction: false,
1714
+ commentExtraction: false,
1715
+ highlightExtraction: false,
1716
+ figureSummaries: false,
1673
1717
  outputFormats
1674
1718
  };
1675
- const vendor = d.vendor ?? id;
1676
1719
  return {
1677
1720
  id,
1678
1721
  name: d.name ?? id,
@@ -1693,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
1693
1736
  supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
1694
1737
  // NEW capabilities
1695
1738
  supportsPrompts: true,
1696
- supportsCitations: false,
1739
+ supportsCitations: vendor === "anthropic",
1740
+ // Anthropic has Citations API
1697
1741
  supportsChunking: false,
1698
1742
  supportsImageExtraction: false,
1699
1743
  supportsPageMarkers: false,
@@ -1740,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
1740
1784
  function normalizeDatalabProvider(id, d) {
1741
1785
  const opts = d.supportedOptions ?? {};
1742
1786
  const isVLM = d.type === "VLM";
1787
+ const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
1788
+ const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
1743
1789
  const model = d.model ?? id;
1744
1790
  const outputFormats = {
1745
1791
  text: true,
@@ -1750,33 +1796,61 @@ function normalizeDatalabProvider(id, d) {
1750
1796
  const features = {
1751
1797
  maxPages: opts.maxPages ?? false,
1752
1798
  pageRange: opts.pageRange ?? false,
1753
- languageHints: opts.langs ?? false,
1754
- // maps from 'langs'
1799
+ languageHints: opts.langs ? "deprecated" : false,
1800
+ // API ignores, handled automatically
1755
1801
  processingModes: opts.mode ?? false,
1756
1802
  agenticMode: false,
1757
1803
  // Datalab doesn't have agentic mode
1758
- customPrompts: opts.blockCorrectionPrompt ?? false,
1804
+ customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
1805
+ // Not currently supported
1759
1806
  imageExtraction: opts.extractImages ?? false,
1760
1807
  pageMarkers: opts.paginate ?? false,
1761
1808
  // maps from 'paginate'
1762
- citations: opts.citations ?? false,
1809
+ citations: isMarkerVLM ? true : false,
1810
+ // Marker VLM has citations
1763
1811
  chunking: false,
1764
1812
  // Datalab doesn't have chunking
1765
1813
  segmentation: opts.segmentation ?? false,
1766
- stripExistingOCR: opts.stripExistingOCR ?? false,
1767
- formatLines: opts.formatLines ?? false,
1768
- forceOCR: true,
1769
- // Datalab supports force_ocr
1814
+ stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
1815
+ // Managed automatically
1816
+ formatLines: opts.formatLines ? "deprecated" : false,
1817
+ // Handled automatically
1818
+ forceOCR: "deprecated",
1819
+ // DEPRECATED: force_ocr param has no effect per API docs
1770
1820
  tableOutputFormats: false,
1771
1821
  tableMerging: false,
1772
1822
  confidence: false,
1773
1823
  // Datalab doesn't provide confidence scores
1774
1824
  boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
1775
- // Datalab provides bounding boxes
1825
+ // Datalab Surya provides text bboxes
1826
+ imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
1827
+ // Marker extracts images with bboxes
1776
1828
  schemaValidation: isVLM,
1777
1829
  // VLM providers support schema validation
1778
1830
  handwrittenText: true,
1779
1831
  // Datalab handles handwritten text
1832
+ headerFooterExtraction: false,
1833
+ // Datalab has issues with header/footer extraction
1834
+ // Extended features
1835
+ embedOptimized: false,
1836
+ passwordProtected: false,
1837
+ contentFiltering: false,
1838
+ ocrMode: false,
1839
+ webhookCallback: true,
1840
+ // Datalab supports webhook callbacks
1841
+ mediaResolution: false,
1842
+ changeTracking: true,
1843
+ // Datalab marker_extras supports track_changes
1844
+ hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
1845
+ // Datalab extras=extract_links
1846
+ chartUnderstanding: isMarkerOCR || isMarkerVLM,
1847
+ // Datalab extras=chart_understanding
1848
+ imageCaptions: isMarkerOCR || isMarkerVLM,
1849
+ // Datalab disable_image_captions param
1850
+ signatureExtraction: false,
1851
+ commentExtraction: false,
1852
+ highlightExtraction: false,
1853
+ figureSummaries: false,
1780
1854
  outputFormats
1781
1855
  };
1782
1856
  return {
@@ -1845,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
1845
1919
  const opts = d.supportedOptions ?? {};
1846
1920
  const isVLM = d.type === "VLM";
1847
1921
  const isExtract = d.compatibleNodes?.extract === true;
1922
+ const isParse = d.compatibleNodes?.parse === true;
1848
1923
  const model = d.model ?? "v1";
1849
1924
  const outputFormats = {
1850
1925
  text: d.outputFormat?.features?.textLines ?? true,
@@ -1854,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
1854
1929
  json: d.outputFormat?.features?.structuredJSON ?? isExtract
1855
1930
  };
1856
1931
  const features = {
1857
- maxPages: opts.maxPages ?? false,
1932
+ maxPages: opts.pageRange ?? false ? "derived" : false,
1933
+ // SDK derives from pageRange (1-indexed)
1858
1934
  pageRange: opts.pageRange ?? false,
1859
- languageHints: opts.langs ?? false,
1860
- // Reducto doesn't support langs
1935
+ languageHints: false,
1936
+ // Reducto doesn't support language hints
1861
1937
  processingModes: false,
1862
1938
  // Reducto uses agentic instead
1863
1939
  agenticMode: opts.mode ?? false,
@@ -1880,12 +1956,44 @@ function normalizeReductoProvider(id, d) {
1880
1956
  // Parse has mergeTables
1881
1957
  confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
1882
1958
  // Reducto Parse has confidence
1883
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? d.compatibleNodes?.parse ?? false,
1884
- // Reducto Parse has bounding boxes
1959
+ boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
1960
+ // Reducto Parse has text bounding boxes
1961
+ imageBoundingBoxes: isParse ? true : false,
1962
+ // Reducto Parse has figure bounding boxes
1885
1963
  schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
1886
1964
  // Extract has schema validation
1887
1965
  handwrittenText: false,
1888
1966
  // Reducto doesn't specifically advertise handwriting
1967
+ headerFooterExtraction: true,
1968
+ // Reducto has Header/Footer block types
1969
+ // Extended features
1970
+ embedOptimized: isParse,
1971
+ // Reducto Parse supports retrieval.embedding_optimized: true
1972
+ passwordProtected: true,
1973
+ // Reducto handles encrypted PDFs
1974
+ contentFiltering: true,
1975
+ // Reducto can filter block types
1976
+ ocrMode: opts.ocrSystem ?? false,
1977
+ // Reducto has ocr_system selection
1978
+ webhookCallback: true,
1979
+ // Reducto supports webhook callbacks
1980
+ mediaResolution: false,
1981
+ changeTracking: true,
1982
+ // Reducto tracks changes in Word docs
1983
+ hyperlinkExtraction: true,
1984
+ // Reducto extracts hyperlinks via formatting.include
1985
+ chartUnderstanding: isParse,
1986
+ // Reducto enhance.agentic[].advanced_chart_agent for figures
1987
+ imageCaptions: false,
1988
+ // Not available in Reducto
1989
+ signatureExtraction: false,
1990
+ // NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
1991
+ commentExtraction: isParse || isExtract,
1992
+ // Reducto formatting.include: ["comments"]
1993
+ highlightExtraction: isParse || isExtract,
1994
+ // Reducto formatting.include: ["highlight"]
1995
+ figureSummaries: isParse,
1996
+ // Reducto enhance.summarize_figures
1889
1997
  outputFormats
1890
1998
  };
1891
1999
  return {
@@ -1971,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
1971
2079
  // Unsiloed doesn't have page range option
1972
2080
  languageHints: false,
1973
2081
  // Unsiloed doesn't support language hints
1974
- processingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
2082
+ processingModes: false,
2083
+ // Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
1975
2084
  agenticMode: false,
1976
2085
  // Unsiloed doesn't have agentic mode
1977
2086
  customPrompts: false,
@@ -1993,12 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
1993
2102
  tableMerging: false,
1994
2103
  confidence: d.outputFormat?.features?.confidence ?? false,
1995
2104
  // Unsiloed may provide confidence
1996
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
1997
- // Unsiloed may provide bounding boxes
2105
+ boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
2106
+ // Unsiloed Parse has bounding boxes
2107
+ imageBoundingBoxes: false,
2108
+ // Unsiloed doesn't return image-specific bboxes
1998
2109
  schemaValidation: isExtract,
1999
2110
  // Extract supports schema validation
2000
- handwrittenText: false,
2001
- // Unsiloed doesn't specifically advertise handwriting
2111
+ handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
2112
+ // Parse supports handwriting
2113
+ headerFooterExtraction: false,
2114
+ // Unsiloed doesn't extract header/footer separately
2115
+ // Extended features
2116
+ embedOptimized: false,
2117
+ passwordProtected: false,
2118
+ contentFiltering: isParse,
2119
+ // Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
2120
+ ocrMode: isParse,
2121
+ // Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
2122
+ webhookCallback: false,
2123
+ // Unsiloed is synchronous
2124
+ mediaResolution: false,
2125
+ changeTracking: false,
2126
+ hyperlinkExtraction: false,
2127
+ chartUnderstanding: false,
2128
+ // Not available in Unsiloed
2129
+ imageCaptions: false,
2130
+ // Not available in Unsiloed
2131
+ signatureExtraction: false,
2132
+ // Not available in Unsiloed
2133
+ commentExtraction: false,
2134
+ // Not available in Unsiloed
2135
+ highlightExtraction: false,
2136
+ // Not available in Unsiloed
2137
+ figureSummaries: false,
2138
+ // Not available in Unsiloed
2002
2139
  outputFormats
2003
2140
  };
2004
2141
  return {
@@ -2027,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
2027
2164
  supportsImageExtraction: false,
2028
2165
  supportsPageMarkers: false,
2029
2166
  supportsLanguageHints: false,
2030
- supportsProcessingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
2167
+ supportsProcessingModes: false,
2168
+ // Unsiloed doesn't have fast/balanced/high_accuracy modes
2031
2169
  supportsSegmentation: isSplit || isCategorize,
2032
2170
  outputFormats
2033
2171
  },
@@ -2245,7 +2383,7 @@ function matchesModelFilter(model, filter) {
2245
2383
  }
2246
2384
  if (filter.hasFeatures && filter.hasFeatures.length > 0) {
2247
2385
  for (const feature of filter.hasFeatures) {
2248
- if (model.features[feature] !== true) {
2386
+ if (!isFeatureEnabled(model.features[feature])) {
2249
2387
  return false;
2250
2388
  }
2251
2389
  }
@@ -2288,6 +2426,186 @@ function getAllModels() {
2288
2426
  function clearModelRegistry() {
2289
2427
  modelRegistry.clear();
2290
2428
  }
2429
/**
 * Page-numbering convention keyed by provider source.
 * Sources missing from this table fall back to "1-indexed" in getPageIndexing.
 */
var PAGE_INDEXING = {
  datalab: "0-indexed",
  reducto: "1-indexed",
  mistral: "0-indexed",
  // Default assumption
  unsiloed: "1-indexed",
  // N/A but default
  llm: "1-indexed"
};

/**
 * Returns the page-numbering convention for a provider.
 *
 * @param {string|{source?: string}|null|undefined} provider - Either a source
 *   name, or an object whose `source` property holds the source name.
 * @returns {string} The table entry for the source, or "1-indexed" when the
 *   source is missing or not listed in PAGE_INDEXING.
 */
function getPageIndexing(provider) {
  // `?.` keeps a null/undefined provider on the default path instead of throwing.
  const source = typeof provider === "string" ? provider : provider?.source;
  // Own-property check: a plain `PAGE_INDEXING[source]` would also resolve
  // inherited keys such as "constructor" or "toString" and return a function.
  if (typeof source === "string" && Object.prototype.hasOwnProperty.call(PAGE_INDEXING, source)) {
    return PAGE_INDEXING[source];
  }
  return "1-indexed";
}
2442
/**
 * Separates the paging options (`maxPages`, `pageRange`) from the rest of
 * `options` and expresses them in a form suited to the given provider.
 *
 * Resolution order:
 * 1. An explicit `pageRange` wins: it is returned as `page_range` and any
 *    `maxPages` is dropped.
 * 2. If the provider marks `features.maxPages` as "derived", `maxPages` is
 *    converted to a `page_range` string using the provider's page indexing
 *    ("0-(n-1)" or "1-n"). For source "mistral" with 0-based indexing an
 *    explicit `pages` array [0..n-1] is added as well.
 * 3. If `features.maxPages` is otherwise enabled (per isFeatureEnabled),
 *    `maxPages` is forwarded unchanged inside `remainingOptions`.
 * 4. Otherwise `maxPages` is dropped.
 *
 * @param {object} options - Caller options; not mutated.
 * @param {{source?: string, features: {maxPages?: *}}} provider - Provider
 *   record; only `source` and `features.maxPages` are read here.
 * @returns {{remainingOptions: object, page_range?: *, pages?: number[]}}
 * @throws {RangeError} When a derived range is requested and `maxPages` is not
 *   a positive integer (such values previously produced malformed ranges like
 *   "0--1" or "1-0").
 */
function transformDerivedFeatures(options, provider) {
  const { maxPages, pageRange, ...remainingOptions } = options;
  const result = { remainingOptions };
  if (pageRange !== void 0) {
    result.page_range = pageRange;
    return result;
  }
  if (maxPages === void 0) {
    // Nothing to translate; provider.features is deliberately not touched.
    return result;
  }
  const maxPagesSupport = provider.features.maxPages;
  if (maxPagesSupport === "derived") {
    // Numeric strings were accepted implicitly before (via `maxPages - 1`),
    // so coerce first and validate the coerced value.
    const pageCount = Number(maxPages);
    if (!Number.isInteger(pageCount) || pageCount < 1) {
      throw new RangeError(
        `maxPages must be a positive integer, received ${String(maxPages)}`
      );
    }
    if (getPageIndexing(provider) === "0-indexed") {
      result.page_range = `0-${pageCount - 1}`;
      if (provider.source === "mistral") {
        result.pages = Array.from({ length: pageCount }, (_, i) => i);
      }
    } else {
      result.page_range = `1-${pageCount}`;
    }
  } else if (isFeatureEnabled(maxPagesSupport)) {
    // Native support: hand the value through untouched.
    remainingOptions.maxPages = maxPages;
  }
  return result;
}
2464
/**
 * Tells whether a provider's `maxPages` feature is marked "derived".
 *
 * @param {{features: {maxPages?: *}}} provider - Provider record; only
 *   `features.maxPages` is read.
 * @returns {boolean} `true` only when `features.maxPages` is exactly "derived".
 */
function requiresMaxPagesTransformation(provider) {
  const { maxPages: maxPagesSupport } = provider.features;
  return maxPagesSupport === "derived";
}
2467
/**
 * Builds the normalized provider record for a Mistral entry.
 *
 * Values present in the raw metadata `d` take precedence; the defaults used
 * when a value is absent depend on the provider type ("OCR" or "VLM").
 *
 * @param {string} id - Registry key; used as fallback for `id`, `name` and
 *   the identity `model`.
 * @param {object} d - Raw provider metadata. Fields read here:
 *   `supportedOptions`, `type`, `model`, `id`, `name`, `outputFormat.features`,
 *   `inputFormats`, `capabilities`, `inputRequirements`, `compatibleNodes`,
 *   `pricing`, `apiConfig.rateLimit`. All are optional.
 * @returns {object} Normalized provider record with `source: "mistral"`; the
 *   untouched input is kept under `raw`.
 */
function normalizeMistralProvider(id, d) {
  const opts = d.supportedOptions ?? {};
  // Resolve the type once so every type-dependent default below agrees with
  // the `type` reported on the record (an entry without `type` is an OCR one).
  const type = d.type ?? "OCR";
  const isVLM = type === "VLM";
  const isOCR = type === "OCR";
  const model = d.model ?? id;
  const rawOutputFeatures = d.outputFormat?.features;
  const mimeTypes = d.inputFormats?.mimeTypes ?? [];

  const outputFormats = {
    text: true,
    markdown: rawOutputFeatures?.markdown ?? isOCR,
    // OCR 3 can output HTML tables
    html: rawOutputFeatures?.htmlTables ?? isOCR,
    json: rawOutputFeatures?.structuredJSON ?? isVLM
  };

  const features = {
    maxPages: d.inputFormats?.maxPages !== void 0,
    // Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
    pageRange: true,
    // Mistral doesn't support language hints
    languageHints: false,
    // Mistral doesn't have processing modes
    processingModes: false,
    // Mistral doesn't have agentic mode
    agenticMode: false,
    // Mistral OCR 3 doesn't support custom prompts
    customPrompts: false,
    // Can include embedded images
    imageExtraction: opts.includeImageBase64 ?? false,
    // Mistral doesn't add page markers
    pageMarkers: false,
    // Mistral doesn't provide citations
    citations: false,
    // Mistral doesn't do chunking
    chunking: false,
    // Mistral doesn't do segmentation
    segmentation: false,
    stripExistingOCR: false,
    formatLines: false,
    // OCR 3 always does OCR
    forceOCR: true,
    // html or markdown table format
    tableOutputFormats: opts.tableFormat ?? isOCR,
    tableMerging: false,
    // Mistral doesn't provide confidence scores
    confidence: false,
    // Mistral does NOT provide text-level bounding boxes
    boundingBoxes: false,
    // Mistral provides image/figure bounding boxes only
    imageBoundingBoxes: true,
    // VLM supports schema
    schemaValidation: rawOutputFeatures?.schemaValidation ?? isVLM,
    // Excellent handwriting support
    handwrittenText: rawOutputFeatures?.handwrittenText ?? true,
    // extract_header/extract_footer: enabled when EITHER option is supported.
    // `||` (not `??`) so an explicit `extractHeader: false` does not hide
    // `extractFooter: true`.
    headerFooterExtraction: (opts.extractHeader || opts.extractFooter) ?? false,
    // Extended features
    embedOptimized: false,
    passwordProtected: false,
    contentFiltering: false,
    ocrMode: false,
    // Mistral is synchronous
    webhookCallback: false,
    mediaResolution: false,
    changeTracking: false,
    // Response pages[].hyperlinks[] auto-extracted
    hyperlinkExtraction: true,
    // Not available as separate feature in Mistral
    chartUnderstanding: false,
    // Not available in Mistral
    imageCaptions: false,
    signatureExtraction: false,
    commentExtraction: false,
    highlightExtraction: false,
    figureSummaries: false,
    outputFormats
  };

  return {
    id: d.id ?? id,
    name: d.name ?? id,
    source: "mistral",
    type,
    // 3-layer identity
    identity: {
      provider: "mistral",
      model,
      method: "native"
    },
    capabilities: {
      supportsImages: d.capabilities?.supportsImages ?? true,
      supportsPDFs: d.capabilities?.supportsPDFs ?? true,
      // Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
      supportsDocuments: d.capabilities?.supportsDocuments ?? true,
      // OCR 3 doesn't do reasoning
      supportsReasoning: false,
      supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
      // Extended capabilities
      supportsPrompts: false,
      supportsCitations: false,
      supportsChunking: false,
      supportsImageExtraction: opts.includeImageBase64 ?? false,
      supportsPageMarkers: false,
      supportsLanguageHints: false,
      supportsProcessingModes: false,
      supportsSegmentation: false,
      outputFormats
    },
    features,
    // Mistral providers always need raw document input
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "raw-document",
      acceptedMethods: d.inputRequirements?.acceptedMethods ?? ["base64", "url"]
    },
    compatibleNodes: {
      parse: d.compatibleNodes?.parse ?? isOCR,
      extract: d.compatibleNodes?.extract ?? isVLM,
      categorize: d.compatibleNodes?.categorize ?? false,
      qualify: d.compatibleNodes?.qualify ?? false,
      split: d.compatibleNodes?.split ?? false
    },
    inputFormats: {
      // One raw MIME list is split into image and non-image entries.
      imageMimeTypes: mimeTypes.filter((m) => m.startsWith("image/")),
      documentMimeTypes: mimeTypes.filter((m) => !m.startsWith("image/")),
      inputMethods: d.inputFormats?.inputMethods ?? ["base64", "url"],
      // 50MB limit
      maxFileSize: d.inputFormats?.maxFileSize ?? 50,
      maxPages: d.inputFormats?.maxPages ?? 1e3
    },
    pricing: {
      model: "per-page",
      // $2/1000 pages
      perPage: d.pricing?.perPage ?? 2e-3,
      currency: "USD",
      notes: d.pricing?.notes ?? "$2 per 1000 pages"
    },
    rateLimits: {
      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
    },
    raw: d
  };
}
2291
2609
 
2292
2610
  // src/retry.ts
2293
2611
  var DEFAULT_RETRY_CONFIG = {
@@ -2507,6 +2825,7 @@ export {
2507
2825
  getNodeTypeName,
2508
2826
  getPDFPageCount,
2509
2827
  getPageCountMetadata,
2828
+ getPageIndexing,
2510
2829
  getProviderById,
2511
2830
  getProvidersBySource,
2512
2831
  getProvidersForLargeFiles,
@@ -2514,6 +2833,7 @@ export {
2514
2833
  getSuggestedConnections,
2515
2834
  getTotalPageCount,
2516
2835
  getValidForEachStarters,
2836
+ isFeatureEnabled,
2517
2837
  isLocalEndpoint,
2518
2838
  isPDFDocument,
2519
2839
  isRetryableError,
@@ -2525,11 +2845,13 @@ export {
2525
2845
  queryProviders,
2526
2846
  registerProviderMetadata,
2527
2847
  registerProviderWithModels,
2848
+ requiresMaxPagesTransformation,
2528
2849
  resolveDocument,
2529
2850
  resolveModelMetadata,
2530
2851
  runPipeline,
2531
2852
  splitPDFIntoChunks,
2532
2853
  toProviderString,
2854
+ transformDerivedFeatures,
2533
2855
  validateFlowInputFormat,
2534
2856
  validateJson,
2535
2857
  validateMimeType,