@doclo/core 0.1.12 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
1398
1398
  }
1399
1399
 
1400
1400
  // src/provider-query.ts
1401
/**
 * Tells whether a provider feature status counts as usable.
 *
 * A status is considered enabled when it is exactly `true`, or one of the
 * string markers "deprecated" or "derived". Every other value is disabled.
 *
 * @param {*} status - Feature status value to test.
 * @returns {boolean} `true` when the status is one of the enabled values.
 */
function isFeatureEnabled(status) {
  // `switch` compares with strict equality, so truthy look-alikes such as
  // 1 or "true" are not accepted.
  switch (status) {
    case true:
    case "deprecated":
    case "derived":
      return true;
    default:
      return false;
  }
}
1401
1404
  var providerRegistry = /* @__PURE__ */ new Map();
1402
1405
  function registerProviderMetadata(source, metadata, normalizer) {
1403
1406
  const normalized = /* @__PURE__ */ new Map();
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
1482
1485
  }
1483
1486
  if (filter.hasFeatures && filter.hasFeatures.length > 0) {
1484
1487
  providers = providers.filter(
1485
- (p) => filter.hasFeatures.every((feature) => p.features[feature] === true)
1488
+ (p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
1486
1489
  );
1487
1490
  }
1488
1491
  if (filter.outputFormat) {
@@ -1581,9 +1584,25 @@ function defaultNormalizer(id, data, source) {
1581
1584
  tableMerging: false,
1582
1585
  confidence: false,
1583
1586
  boundingBoxes: false,
1587
+ imageBoundingBoxes: false,
1584
1588
  schemaValidation: false,
1585
1589
  handwrittenText: false,
1586
1590
  headerFooterExtraction: false,
1591
+ // Extended features
1592
+ embedOptimized: false,
1593
+ passwordProtected: false,
1594
+ contentFiltering: false,
1595
+ ocrMode: false,
1596
+ webhookCallback: false,
1597
+ mediaResolution: false,
1598
+ changeTracking: false,
1599
+ hyperlinkExtraction: false,
1600
+ chartUnderstanding: false,
1601
+ imageCaptions: false,
1602
+ signatureExtraction: false,
1603
+ commentExtraction: false,
1604
+ highlightExtraction: false,
1605
+ figureSummaries: false,
1587
1606
  outputFormats: defaultOutputFormats
1588
1607
  };
1589
1608
  return {
@@ -1638,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
1638
1657
  html: true,
1639
1658
  json: d.capabilities?.supportsStructuredOutput ?? true
1640
1659
  };
1660
+ const vendor = d.vendor ?? id;
1641
1661
  const features = {
1642
- maxPages: d.inputFormats?.pdfs?.maxPages !== void 0,
1643
- pageRange: true,
1644
- // LLMs can handle page ranges
1662
+ maxPages: "derived",
1663
+ // SDK can limit via pre-processing
1664
+ pageRange: false,
1665
+ // No native API support - LLMs receive full text
1645
1666
  languageHints: false,
1646
1667
  // Not applicable to LLMs
1647
1668
  processingModes: false,
@@ -1654,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
1654
1675
  // LLMs don't extract images
1655
1676
  pageMarkers: false,
1656
1677
  // LLMs don't add page markers
1657
- citations: false,
1658
- // Most LLMs don't have native citations (Anthropic has different API)
1678
+ citations: vendor === "anthropic" ? true : false,
1679
+ // Anthropic has Citations API
1659
1680
  chunking: false,
1660
1681
  // LLMs don't do chunking
1661
1682
  segmentation: false,
@@ -1669,15 +1690,32 @@ function normalizeLLMProvider(id, d) {
1669
1690
  // LLMs don't provide confidence scores
1670
1691
  boundingBoxes: false,
1671
1692
  // LLMs don't provide bounding boxes
1693
+ imageBoundingBoxes: false,
1694
+ // LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
1672
1695
  schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
1673
1696
  // Some LLMs support schema validation
1674
1697
  handwrittenText: false,
1675
1698
  // Not specific to LLMs
1676
1699
  headerFooterExtraction: false,
1677
1700
  // LLMs don't extract header/footer separately
1701
+ // Extended features
1702
+ embedOptimized: false,
1703
+ passwordProtected: false,
1704
+ contentFiltering: false,
1705
+ ocrMode: false,
1706
+ webhookCallback: false,
1707
+ mediaResolution: vendor === "google" ? true : false,
1708
+ // Google Gemini has mediaResolution
1709
+ changeTracking: false,
1710
+ hyperlinkExtraction: false,
1711
+ chartUnderstanding: false,
1712
+ imageCaptions: false,
1713
+ signatureExtraction: false,
1714
+ commentExtraction: false,
1715
+ highlightExtraction: false,
1716
+ figureSummaries: false,
1678
1717
  outputFormats
1679
1718
  };
1680
- const vendor = d.vendor ?? id;
1681
1719
  return {
1682
1720
  id,
1683
1721
  name: d.name ?? id,
@@ -1698,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
1698
1736
  supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
1699
1737
  // NEW capabilities
1700
1738
  supportsPrompts: true,
1701
- supportsCitations: false,
1739
+ supportsCitations: vendor === "anthropic",
1740
+ // Anthropic has Citations API
1702
1741
  supportsChunking: false,
1703
1742
  supportsImageExtraction: false,
1704
1743
  supportsPageMarkers: false,
@@ -1745,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
1745
1784
  function normalizeDatalabProvider(id, d) {
1746
1785
  const opts = d.supportedOptions ?? {};
1747
1786
  const isVLM = d.type === "VLM";
1787
+ const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
1788
+ const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
1748
1789
  const model = d.model ?? id;
1749
1790
  const outputFormats = {
1750
1791
  text: true,
@@ -1755,35 +1796,61 @@ function normalizeDatalabProvider(id, d) {
1755
1796
  const features = {
1756
1797
  maxPages: opts.maxPages ?? false,
1757
1798
  pageRange: opts.pageRange ?? false,
1758
- languageHints: opts.langs ?? false,
1759
- // maps from 'langs'
1799
+ languageHints: opts.langs ? "deprecated" : false,
1800
+ // API ignores, handled automatically
1760
1801
  processingModes: opts.mode ?? false,
1761
1802
  agenticMode: false,
1762
1803
  // Datalab doesn't have agentic mode
1763
- customPrompts: opts.blockCorrectionPrompt ?? false,
1804
+ customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
1805
+ // Not currently supported
1764
1806
  imageExtraction: opts.extractImages ?? false,
1765
1807
  pageMarkers: opts.paginate ?? false,
1766
1808
  // maps from 'paginate'
1767
- citations: opts.citations ?? false,
1809
+ citations: isMarkerVLM ? true : false,
1810
+ // Marker VLM has citations
1768
1811
  chunking: false,
1769
1812
  // Datalab doesn't have chunking
1770
1813
  segmentation: opts.segmentation ?? false,
1771
- stripExistingOCR: opts.stripExistingOCR ?? false,
1772
- formatLines: opts.formatLines ?? false,
1773
- forceOCR: true,
1774
- // Datalab supports force_ocr
1814
+ stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
1815
+ // Managed automatically
1816
+ formatLines: opts.formatLines ? "deprecated" : false,
1817
+ // Handled automatically
1818
+ forceOCR: "deprecated",
1819
+ // DEPRECATED: force_ocr param has no effect per API docs
1775
1820
  tableOutputFormats: false,
1776
1821
  tableMerging: false,
1777
1822
  confidence: false,
1778
1823
  // Datalab doesn't provide confidence scores
1779
1824
  boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
1780
- // Datalab provides bounding boxes
1825
+ // Datalab Surya provides text bboxes
1826
+ imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
1827
+ // Marker extracts images with bboxes
1781
1828
  schemaValidation: isVLM,
1782
1829
  // VLM providers support schema validation
1783
1830
  handwrittenText: true,
1784
1831
  // Datalab handles handwritten text
1785
1832
  headerFooterExtraction: false,
1786
1833
  // Datalab has issues with header/footer extraction
1834
+ // Extended features
1835
+ embedOptimized: false,
1836
+ passwordProtected: false,
1837
+ contentFiltering: false,
1838
+ ocrMode: false,
1839
+ webhookCallback: true,
1840
+ // Datalab supports webhook callbacks
1841
+ mediaResolution: false,
1842
+ changeTracking: true,
1843
+ // Datalab marker_extras supports track_changes
1844
+ hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
1845
+ // Datalab extras=extract_links
1846
+ chartUnderstanding: isMarkerOCR || isMarkerVLM,
1847
+ // Datalab extras=chart_understanding
1848
+ imageCaptions: isMarkerOCR || isMarkerVLM,
1849
+ // Datalab disable_image_captions param
1850
+ signatureExtraction: false,
1851
+ commentExtraction: false,
1852
+ highlightExtraction: false,
1853
+ figureSummaries: false,
1787
1854
  outputFormats
1788
1855
  };
1789
1856
  return {
@@ -1852,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
1852
1919
  const opts = d.supportedOptions ?? {};
1853
1920
  const isVLM = d.type === "VLM";
1854
1921
  const isExtract = d.compatibleNodes?.extract === true;
1922
+ const isParse = d.compatibleNodes?.parse === true;
1855
1923
  const model = d.model ?? "v1";
1856
1924
  const outputFormats = {
1857
1925
  text: d.outputFormat?.features?.textLines ?? true,
@@ -1861,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
1861
1929
  json: d.outputFormat?.features?.structuredJSON ?? isExtract
1862
1930
  };
1863
1931
  const features = {
1864
- maxPages: opts.maxPages ?? false,
1932
+ maxPages: opts.pageRange ?? false ? "derived" : false,
1933
+ // SDK derives from pageRange (1-indexed)
1865
1934
  pageRange: opts.pageRange ?? false,
1866
- languageHints: opts.langs ?? false,
1867
- // Reducto doesn't support langs
1935
+ languageHints: false,
1936
+ // Reducto doesn't support language hints
1868
1937
  processingModes: false,
1869
1938
  // Reducto uses agentic instead
1870
1939
  agenticMode: opts.mode ?? false,
@@ -1887,14 +1956,44 @@ function normalizeReductoProvider(id, d) {
1887
1956
  // Parse has mergeTables
1888
1957
  confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
1889
1958
  // Reducto Parse has confidence
1890
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? d.compatibleNodes?.parse ?? false,
1891
- // Reducto Parse has bounding boxes
1959
+ boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
1960
+ // Reducto Parse has text bounding boxes
1961
+ imageBoundingBoxes: isParse ? true : false,
1962
+ // Reducto Parse has figure bounding boxes
1892
1963
  schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
1893
1964
  // Extract has schema validation
1894
1965
  handwrittenText: false,
1895
1966
  // Reducto doesn't specifically advertise handwriting
1896
1967
  headerFooterExtraction: true,
1897
1968
  // Reducto has Header/Footer block types
1969
+ // Extended features
1970
+ embedOptimized: isParse,
1971
+ // Reducto Parse supports retrieval.embedding_optimized: true
1972
+ passwordProtected: true,
1973
+ // Reducto handles encrypted PDFs
1974
+ contentFiltering: true,
1975
+ // Reducto can filter block types
1976
+ ocrMode: opts.ocrSystem ?? false,
1977
+ // Reducto has ocr_system selection
1978
+ webhookCallback: true,
1979
+ // Reducto supports webhook callbacks
1980
+ mediaResolution: false,
1981
+ changeTracking: true,
1982
+ // Reducto tracks changes in Word docs
1983
+ hyperlinkExtraction: true,
1984
+ // Reducto extracts hyperlinks via formatting.include
1985
+ chartUnderstanding: isParse,
1986
+ // Reducto enhance.agentic[].advanced_chart_agent for figures
1987
+ imageCaptions: false,
1988
+ // Not available in Reducto
1989
+ signatureExtraction: false,
1990
+ // NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
1991
+ commentExtraction: isParse || isExtract,
1992
+ // Reducto formatting.include: ["comments"]
1993
+ highlightExtraction: isParse || isExtract,
1994
+ // Reducto formatting.include: ["highlight"]
1995
+ figureSummaries: isParse,
1996
+ // Reducto enhance.summarize_figures
1898
1997
  outputFormats
1899
1998
  };
1900
1999
  return {
@@ -1980,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
1980
2079
  // Unsiloed doesn't have page range option
1981
2080
  languageHints: false,
1982
2081
  // Unsiloed doesn't support language hints
1983
- processingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
2082
+ processingModes: false,
2083
+ // Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
1984
2084
  agenticMode: false,
1985
2085
  // Unsiloed doesn't have agentic mode
1986
2086
  customPrompts: false,
@@ -2002,14 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
2002
2102
  tableMerging: false,
2003
2103
  confidence: d.outputFormat?.features?.confidence ?? false,
2004
2104
  // Unsiloed may provide confidence
2005
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
2006
- // Unsiloed may provide bounding boxes
2105
+ boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
2106
+ // Unsiloed Parse has bounding boxes
2107
+ imageBoundingBoxes: false,
2108
+ // Unsiloed doesn't return image-specific bboxes
2007
2109
  schemaValidation: isExtract,
2008
2110
  // Extract supports schema validation
2009
- handwrittenText: false,
2010
- // Unsiloed doesn't specifically advertise handwriting
2111
+ handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
2112
+ // Parse supports handwriting
2011
2113
  headerFooterExtraction: false,
2012
2114
  // Unsiloed doesn't extract header/footer separately
2115
+ // Extended features
2116
+ embedOptimized: false,
2117
+ passwordProtected: false,
2118
+ contentFiltering: isParse,
2119
+ // Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
2120
+ ocrMode: isParse,
2121
+ // Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
2122
+ webhookCallback: false,
2123
+ // Unsiloed is synchronous
2124
+ mediaResolution: false,
2125
+ changeTracking: false,
2126
+ hyperlinkExtraction: false,
2127
+ chartUnderstanding: false,
2128
+ // Not available in Unsiloed
2129
+ imageCaptions: false,
2130
+ // Not available in Unsiloed
2131
+ signatureExtraction: false,
2132
+ // Not available in Unsiloed
2133
+ commentExtraction: false,
2134
+ // Not available in Unsiloed
2135
+ highlightExtraction: false,
2136
+ // Not available in Unsiloed
2137
+ figureSummaries: false,
2138
+ // Not available in Unsiloed
2013
2139
  outputFormats
2014
2140
  };
2015
2141
  return {
@@ -2038,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
2038
2164
  supportsImageExtraction: false,
2039
2165
  supportsPageMarkers: false,
2040
2166
  supportsLanguageHints: false,
2041
- supportsProcessingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
2167
+ supportsProcessingModes: false,
2168
+ // Unsiloed doesn't have fast/balanced/high_accuracy modes
2042
2169
  supportsSegmentation: isSplit || isCategorize,
2043
2170
  outputFormats
2044
2171
  },
@@ -2256,7 +2383,7 @@ function matchesModelFilter(model, filter) {
2256
2383
  }
2257
2384
  if (filter.hasFeatures && filter.hasFeatures.length > 0) {
2258
2385
  for (const feature of filter.hasFeatures) {
2259
- if (model.features[feature] !== true) {
2386
+ if (!isFeatureEnabled(model.features[feature])) {
2260
2387
  return false;
2261
2388
  }
2262
2389
  }
@@ -2299,6 +2426,44 @@ function getAllModels() {
2299
2426
  function clearModelRegistry() {
2300
2427
  modelRegistry.clear();
2301
2428
  }
2429
/**
 * Page-numbering convention keyed by provider source name.
 * Sources that are not listed fall back to "1-indexed" in getPageIndexing.
 */
var PAGE_INDEXING = {
  datalab: "0-indexed",
  reducto: "1-indexed",
  mistral: "0-indexed",
  unsiloed: "1-indexed",
  // Default assumption
  llm: "1-indexed"
  // N/A but default
};

/**
 * Returns the page-indexing convention for a provider.
 *
 * @param {string|{source?: string}} provider - Either a source name, or an
 *   object whose `source` property holds the source name.
 * @returns {string} "0-indexed" or "1-indexed"; "1-indexed" when the source
 *   is not listed in PAGE_INDEXING.
 */
function getPageIndexing(provider) {
  const source = typeof provider === "string" ? provider : provider.source;
  // Own-property check: a plain bracket lookup would also return inherited
  // Object.prototype members (e.g. "constructor", "toString") instead of
  // falling back to the default.
  if (Object.prototype.hasOwnProperty.call(PAGE_INDEXING, source)) {
    return PAGE_INDEXING[source];
  }
  return "1-indexed";
}
2442
/**
 * Splits page-limiting options out of `options` and translates them into the
 * fields written on the returned object.
 *
 * Resolution order:
 *  1. If `pageRange` is given it is copied to `page_range` and `maxPages` is
 *     ignored.
 *  2. If `maxPages` is given and the provider marks `features.maxPages` as
 *     "derived", a `page_range` string is built using the provider's page
 *     indexing ("0-<n-1>" or "1-<n>"). For source "mistral" an explicit
 *     `pages` array [0..n-1] is added as well.
 *  3. If `maxPages` is given and the provider's `maxPages` feature is
 *     otherwise enabled, it is passed through in `remainingOptions.maxPages`.
 *  4. Otherwise `maxPages` is dropped.
 *
 * @param {object} options - Caller options; `maxPages` and `pageRange` are
 *   extracted, everything else is returned under `remainingOptions`.
 * @param {{source?: string, features: {maxPages?: *}}} provider - Provider
 *   metadata; `features` is only read when `maxPages` is supplied without
 *   `pageRange`.
 * @returns {{remainingOptions: object, page_range?: *, pages?: number[]}}
 * @throws {RangeError} When a derived range has to be built and `maxPages`
 *   is not a positive integer (previously this produced malformed ranges
 *   such as "0--1" or "1-0").
 */
function transformDerivedFeatures(options, provider) {
  const { maxPages, pageRange, ...remainingOptions } = options;
  const result = { remainingOptions };
  if (pageRange !== void 0) {
    // An explicit range always wins over maxPages.
    result.page_range = pageRange;
    return result;
  }
  if (maxPages === void 0) {
    return result;
  }
  const maxPagesStatus = provider.features.maxPages;
  if (maxPagesStatus === "derived") {
    // Coerce once so numeric strings keep working, then reject anything that
    // cannot describe a page count.
    const pageCount = Number(maxPages);
    if (!Number.isInteger(pageCount) || pageCount < 1) {
      throw new RangeError(`maxPages must be a positive integer, received: ${String(maxPages)}`);
    }
    if (getPageIndexing(provider) === "0-indexed") {
      result.page_range = `0-${pageCount - 1}`;
      if (provider.source === "mistral") {
        result.pages = Array.from({ length: pageCount }, (_, i) => i);
      }
    } else {
      result.page_range = `1-${pageCount}`;
    }
  } else if (isFeatureEnabled(maxPagesStatus)) {
    // Feature is enabled without derivation: forward the value untouched.
    result.remainingOptions.maxPages = maxPages;
  }
  return result;
}
2464
/**
 * Reports whether the provider's `maxPages` feature is marked "derived".
 *
 * @param {{features: {maxPages?: *}}} provider - Provider metadata object.
 * @returns {boolean} `true` only when `provider.features.maxPages` is the
 *   string "derived".
 */
function requiresMaxPagesTransformation(provider) {
  const { maxPages: maxPagesStatus } = provider.features;
  return maxPagesStatus === "derived";
}
2302
2467
  function normalizeMistralProvider(id, d) {
2303
2468
  const opts = d.supportedOptions ?? {};
2304
2469
  const isVLM = d.type === "VLM";
@@ -2314,7 +2479,7 @@ function normalizeMistralProvider(id, d) {
2314
2479
  const features = {
2315
2480
  maxPages: d.inputFormats?.maxPages !== void 0,
2316
2481
  pageRange: true,
2317
- // Mistral supports pages param: "0-5" or [0,2,5]
2482
+ // Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
2318
2483
  languageHints: false,
2319
2484
  // Mistral doesn't support language hints
2320
2485
  processingModes: false,
@@ -2342,14 +2507,39 @@ function normalizeMistralProvider(id, d) {
2342
2507
  tableMerging: false,
2343
2508
  confidence: false,
2344
2509
  // Mistral doesn't provide confidence scores
2345
- boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
2346
- // NO text-level bboxes
2510
+ boundingBoxes: false,
2511
+ // Mistral does NOT provide text-level bounding boxes
2512
+ imageBoundingBoxes: true,
2513
+ // Mistral provides image/figure bounding boxes only
2347
2514
  schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
2348
2515
  // VLM supports schema
2349
2516
  handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
2350
2517
  // Excellent handwriting support
2351
2518
  headerFooterExtraction: opts.extractHeader ?? opts.extractFooter ?? false,
2352
2519
  // extract_header/extract_footer
2520
+ // Extended features
2521
+ embedOptimized: false,
2522
+ passwordProtected: false,
2523
+ contentFiltering: false,
2524
+ ocrMode: false,
2525
+ webhookCallback: false,
2526
+ // Mistral is synchronous
2527
+ mediaResolution: false,
2528
+ changeTracking: false,
2529
+ hyperlinkExtraction: true,
2530
+ // Response pages[].hyperlinks[] auto-extracted
2531
+ chartUnderstanding: false,
2532
+ // Not available as separate feature in Mistral
2533
+ imageCaptions: false,
2534
+ // Not available in Mistral
2535
+ signatureExtraction: false,
2536
+ // Not available in Mistral
2537
+ commentExtraction: false,
2538
+ // Not available in Mistral
2539
+ highlightExtraction: false,
2540
+ // Not available in Mistral
2541
+ figureSummaries: false,
2542
+ // Not available in Mistral
2353
2543
  outputFormats
2354
2544
  };
2355
2545
  return {
@@ -2366,8 +2556,8 @@ function normalizeMistralProvider(id, d) {
2366
2556
  capabilities: {
2367
2557
  supportsImages: d.capabilities?.supportsImages ?? true,
2368
2558
  supportsPDFs: d.capabilities?.supportsPDFs ?? true,
2369
- supportsDocuments: d.capabilities?.supportsDocuments ?? false,
2370
- // DOCX/PPTX has known issues
2559
+ supportsDocuments: d.capabilities?.supportsDocuments ?? true,
2560
+ // Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
2371
2561
  supportsReasoning: false,
2372
2562
  // OCR 3 doesn't do reasoning
2373
2563
  supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
@@ -2635,6 +2825,7 @@ export {
2635
2825
  getNodeTypeName,
2636
2826
  getPDFPageCount,
2637
2827
  getPageCountMetadata,
2828
+ getPageIndexing,
2638
2829
  getProviderById,
2639
2830
  getProvidersBySource,
2640
2831
  getProvidersForLargeFiles,
@@ -2642,6 +2833,7 @@ export {
2642
2833
  getSuggestedConnections,
2643
2834
  getTotalPageCount,
2644
2835
  getValidForEachStarters,
2836
+ isFeatureEnabled,
2645
2837
  isLocalEndpoint,
2646
2838
  isPDFDocument,
2647
2839
  isRetryableError,
@@ -2653,11 +2845,13 @@ export {
2653
2845
  queryProviders,
2654
2846
  registerProviderMetadata,
2655
2847
  registerProviderWithModels,
2848
+ requiresMaxPagesTransformation,
2656
2849
  resolveDocument,
2657
2850
  resolveModelMetadata,
2658
2851
  runPipeline,
2659
2852
  splitPDFIntoChunks,
2660
2853
  toProviderString,
2854
+ transformDerivedFeatures,
2661
2855
  validateFlowInputFormat,
2662
2856
  validateJson,
2663
2857
  validateMimeType,