@doclo/core 0.1.12 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +130 -26
- package/dist/index.js +228 -34
- package/dist/index.js.map +1 -1
- package/dist/internal/validation-utils.d.ts +1 -1
- package/dist/internal/validation-utils.js.map +1 -1
- package/dist/pdf-utils.d.ts +1 -1
- package/dist/{validation-wlK06puw.d.ts → validation-B8GRTtww.d.ts} +43 -2
- package/dist/validation.d.ts +1 -1
- package/dist/validation.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
|
|
|
1398
1398
|
}
|
|
1399
1399
|
|
|
1400
1400
|
// src/provider-query.ts
|
|
1401
|
+
/**
 * Reports whether a provider feature status counts as enabled.
 *
 * A status is enabled when it is exactly `true`, or one of the string
 * markers "deprecated" or "derived". Every other value (including `false`,
 * `undefined`, and look-alike strings) is treated as disabled.
 *
 * @param {*} status - Feature status value taken from a feature map.
 * @returns {boolean} `true` when the status is one of the enabled values.
 */
function isFeatureEnabled(status) {
  // `switch` compares with strict equality, so no coercion takes place.
  switch (status) {
    case true:
    case "deprecated":
    case "derived":
      return true;
    default:
      return false;
  }
}
|
|
1401
1404
|
var providerRegistry = /* @__PURE__ */ new Map();
|
|
1402
1405
|
function registerProviderMetadata(source, metadata, normalizer) {
|
|
1403
1406
|
const normalized = /* @__PURE__ */ new Map();
|
|
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
|
|
|
1482
1485
|
}
|
|
1483
1486
|
if (filter.hasFeatures && filter.hasFeatures.length > 0) {
|
|
1484
1487
|
providers = providers.filter(
|
|
1485
|
-
(p) => filter.hasFeatures.every((feature) => p.features[feature]
|
|
1488
|
+
(p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
|
|
1486
1489
|
);
|
|
1487
1490
|
}
|
|
1488
1491
|
if (filter.outputFormat) {
|
|
@@ -1581,9 +1584,25 @@ function defaultNormalizer(id, data, source) {
|
|
|
1581
1584
|
tableMerging: false,
|
|
1582
1585
|
confidence: false,
|
|
1583
1586
|
boundingBoxes: false,
|
|
1587
|
+
imageBoundingBoxes: false,
|
|
1584
1588
|
schemaValidation: false,
|
|
1585
1589
|
handwrittenText: false,
|
|
1586
1590
|
headerFooterExtraction: false,
|
|
1591
|
+
// Extended features
|
|
1592
|
+
embedOptimized: false,
|
|
1593
|
+
passwordProtected: false,
|
|
1594
|
+
contentFiltering: false,
|
|
1595
|
+
ocrMode: false,
|
|
1596
|
+
webhookCallback: false,
|
|
1597
|
+
mediaResolution: false,
|
|
1598
|
+
changeTracking: false,
|
|
1599
|
+
hyperlinkExtraction: false,
|
|
1600
|
+
chartUnderstanding: false,
|
|
1601
|
+
imageCaptions: false,
|
|
1602
|
+
signatureExtraction: false,
|
|
1603
|
+
commentExtraction: false,
|
|
1604
|
+
highlightExtraction: false,
|
|
1605
|
+
figureSummaries: false,
|
|
1587
1606
|
outputFormats: defaultOutputFormats
|
|
1588
1607
|
};
|
|
1589
1608
|
return {
|
|
@@ -1638,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
|
|
|
1638
1657
|
html: true,
|
|
1639
1658
|
json: d.capabilities?.supportsStructuredOutput ?? true
|
|
1640
1659
|
};
|
|
1660
|
+
const vendor = d.vendor ?? id;
|
|
1641
1661
|
const features = {
|
|
1642
|
-
maxPages:
|
|
1643
|
-
|
|
1644
|
-
|
|
1662
|
+
maxPages: "derived",
|
|
1663
|
+
// SDK can limit via pre-processing
|
|
1664
|
+
pageRange: false,
|
|
1665
|
+
// No native API support - LLMs receive full text
|
|
1645
1666
|
languageHints: false,
|
|
1646
1667
|
// Not applicable to LLMs
|
|
1647
1668
|
processingModes: false,
|
|
@@ -1654,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1654
1675
|
// LLMs don't extract images
|
|
1655
1676
|
pageMarkers: false,
|
|
1656
1677
|
// LLMs don't add page markers
|
|
1657
|
-
citations: false,
|
|
1658
|
-
//
|
|
1678
|
+
citations: vendor === "anthropic" ? true : false,
|
|
1679
|
+
// Anthropic has Citations API
|
|
1659
1680
|
chunking: false,
|
|
1660
1681
|
// LLMs don't do chunking
|
|
1661
1682
|
segmentation: false,
|
|
@@ -1669,15 +1690,32 @@ function normalizeLLMProvider(id, d) {
|
|
|
1669
1690
|
// LLMs don't provide confidence scores
|
|
1670
1691
|
boundingBoxes: false,
|
|
1671
1692
|
// LLMs don't provide bounding boxes
|
|
1693
|
+
imageBoundingBoxes: false,
|
|
1694
|
+
// LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
|
|
1672
1695
|
schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
|
|
1673
1696
|
// Some LLMs support schema validation
|
|
1674
1697
|
handwrittenText: false,
|
|
1675
1698
|
// Not specific to LLMs
|
|
1676
1699
|
headerFooterExtraction: false,
|
|
1677
1700
|
// LLMs don't extract header/footer separately
|
|
1701
|
+
// Extended features
|
|
1702
|
+
embedOptimized: false,
|
|
1703
|
+
passwordProtected: false,
|
|
1704
|
+
contentFiltering: false,
|
|
1705
|
+
ocrMode: false,
|
|
1706
|
+
webhookCallback: false,
|
|
1707
|
+
mediaResolution: vendor === "google" ? true : false,
|
|
1708
|
+
// Google Gemini has mediaResolution
|
|
1709
|
+
changeTracking: false,
|
|
1710
|
+
hyperlinkExtraction: false,
|
|
1711
|
+
chartUnderstanding: false,
|
|
1712
|
+
imageCaptions: false,
|
|
1713
|
+
signatureExtraction: false,
|
|
1714
|
+
commentExtraction: false,
|
|
1715
|
+
highlightExtraction: false,
|
|
1716
|
+
figureSummaries: false,
|
|
1678
1717
|
outputFormats
|
|
1679
1718
|
};
|
|
1680
|
-
const vendor = d.vendor ?? id;
|
|
1681
1719
|
return {
|
|
1682
1720
|
id,
|
|
1683
1721
|
name: d.name ?? id,
|
|
@@ -1698,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1698
1736
|
supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
|
|
1699
1737
|
// NEW capabilities
|
|
1700
1738
|
supportsPrompts: true,
|
|
1701
|
-
supportsCitations:
|
|
1739
|
+
supportsCitations: vendor === "anthropic",
|
|
1740
|
+
// Anthropic has Citations API
|
|
1702
1741
|
supportsChunking: false,
|
|
1703
1742
|
supportsImageExtraction: false,
|
|
1704
1743
|
supportsPageMarkers: false,
|
|
@@ -1745,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1745
1784
|
function normalizeDatalabProvider(id, d) {
|
|
1746
1785
|
const opts = d.supportedOptions ?? {};
|
|
1747
1786
|
const isVLM = d.type === "VLM";
|
|
1787
|
+
const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
|
|
1788
|
+
const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
|
|
1748
1789
|
const model = d.model ?? id;
|
|
1749
1790
|
const outputFormats = {
|
|
1750
1791
|
text: true,
|
|
@@ -1755,35 +1796,61 @@ function normalizeDatalabProvider(id, d) {
|
|
|
1755
1796
|
const features = {
|
|
1756
1797
|
maxPages: opts.maxPages ?? false,
|
|
1757
1798
|
pageRange: opts.pageRange ?? false,
|
|
1758
|
-
languageHints: opts.langs
|
|
1759
|
-
//
|
|
1799
|
+
languageHints: opts.langs ? "deprecated" : false,
|
|
1800
|
+
// API ignores, handled automatically
|
|
1760
1801
|
processingModes: opts.mode ?? false,
|
|
1761
1802
|
agenticMode: false,
|
|
1762
1803
|
// Datalab doesn't have agentic mode
|
|
1763
|
-
customPrompts: opts.blockCorrectionPrompt
|
|
1804
|
+
customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
|
|
1805
|
+
// Not currently supported
|
|
1764
1806
|
imageExtraction: opts.extractImages ?? false,
|
|
1765
1807
|
pageMarkers: opts.paginate ?? false,
|
|
1766
1808
|
// maps from 'paginate'
|
|
1767
|
-
citations:
|
|
1809
|
+
citations: isMarkerVLM ? true : false,
|
|
1810
|
+
// Marker VLM has citations
|
|
1768
1811
|
chunking: false,
|
|
1769
1812
|
// Datalab doesn't have chunking
|
|
1770
1813
|
segmentation: opts.segmentation ?? false,
|
|
1771
|
-
stripExistingOCR: opts.stripExistingOCR
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
//
|
|
1814
|
+
stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
|
|
1815
|
+
// Managed automatically
|
|
1816
|
+
formatLines: opts.formatLines ? "deprecated" : false,
|
|
1817
|
+
// Handled automatically
|
|
1818
|
+
forceOCR: "deprecated",
|
|
1819
|
+
// DEPRECATED: force_ocr param has no effect per API docs
|
|
1775
1820
|
tableOutputFormats: false,
|
|
1776
1821
|
tableMerging: false,
|
|
1777
1822
|
confidence: false,
|
|
1778
1823
|
// Datalab doesn't provide confidence scores
|
|
1779
1824
|
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
|
|
1780
|
-
// Datalab provides
|
|
1825
|
+
// Datalab Surya provides text bboxes
|
|
1826
|
+
imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
|
|
1827
|
+
// Marker extracts images with bboxes
|
|
1781
1828
|
schemaValidation: isVLM,
|
|
1782
1829
|
// VLM providers support schema validation
|
|
1783
1830
|
handwrittenText: true,
|
|
1784
1831
|
// Datalab handles handwritten text
|
|
1785
1832
|
headerFooterExtraction: false,
|
|
1786
1833
|
// Datalab has issues with header/footer extraction
|
|
1834
|
+
// Extended features
|
|
1835
|
+
embedOptimized: false,
|
|
1836
|
+
passwordProtected: false,
|
|
1837
|
+
contentFiltering: false,
|
|
1838
|
+
ocrMode: false,
|
|
1839
|
+
webhookCallback: true,
|
|
1840
|
+
// Datalab supports webhook callbacks
|
|
1841
|
+
mediaResolution: false,
|
|
1842
|
+
changeTracking: true,
|
|
1843
|
+
// Datalab marker_extras supports track_changes
|
|
1844
|
+
hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
|
|
1845
|
+
// Datalab extras=extract_links
|
|
1846
|
+
chartUnderstanding: isMarkerOCR || isMarkerVLM,
|
|
1847
|
+
// Datalab extras=chart_understanding
|
|
1848
|
+
imageCaptions: isMarkerOCR || isMarkerVLM,
|
|
1849
|
+
// Datalab disable_image_captions param
|
|
1850
|
+
signatureExtraction: false,
|
|
1851
|
+
commentExtraction: false,
|
|
1852
|
+
highlightExtraction: false,
|
|
1853
|
+
figureSummaries: false,
|
|
1787
1854
|
outputFormats
|
|
1788
1855
|
};
|
|
1789
1856
|
return {
|
|
@@ -1852,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
|
|
|
1852
1919
|
const opts = d.supportedOptions ?? {};
|
|
1853
1920
|
const isVLM = d.type === "VLM";
|
|
1854
1921
|
const isExtract = d.compatibleNodes?.extract === true;
|
|
1922
|
+
const isParse = d.compatibleNodes?.parse === true;
|
|
1855
1923
|
const model = d.model ?? "v1";
|
|
1856
1924
|
const outputFormats = {
|
|
1857
1925
|
text: d.outputFormat?.features?.textLines ?? true,
|
|
@@ -1861,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
|
|
|
1861
1929
|
json: d.outputFormat?.features?.structuredJSON ?? isExtract
|
|
1862
1930
|
};
|
|
1863
1931
|
const features = {
|
|
1864
|
-
maxPages: opts.
|
|
1932
|
+
maxPages: opts.pageRange ?? false ? "derived" : false,
|
|
1933
|
+
// SDK derives from pageRange (1-indexed)
|
|
1865
1934
|
pageRange: opts.pageRange ?? false,
|
|
1866
|
-
languageHints:
|
|
1867
|
-
// Reducto doesn't support
|
|
1935
|
+
languageHints: false,
|
|
1936
|
+
// Reducto doesn't support language hints
|
|
1868
1937
|
processingModes: false,
|
|
1869
1938
|
// Reducto uses agentic instead
|
|
1870
1939
|
agenticMode: opts.mode ?? false,
|
|
@@ -1887,14 +1956,44 @@ function normalizeReductoProvider(id, d) {
|
|
|
1887
1956
|
// Parse has mergeTables
|
|
1888
1957
|
confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
|
|
1889
1958
|
// Reducto Parse has confidence
|
|
1890
|
-
boundingBoxes: d.outputFormat?.features?.boundingBoxes ??
|
|
1891
|
-
// Reducto Parse has bounding boxes
|
|
1959
|
+
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
|
|
1960
|
+
// Reducto Parse has text bounding boxes
|
|
1961
|
+
imageBoundingBoxes: isParse ? true : false,
|
|
1962
|
+
// Reducto Parse has figure bounding boxes
|
|
1892
1963
|
schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
|
|
1893
1964
|
// Extract has schema validation
|
|
1894
1965
|
handwrittenText: false,
|
|
1895
1966
|
// Reducto doesn't specifically advertise handwriting
|
|
1896
1967
|
headerFooterExtraction: true,
|
|
1897
1968
|
// Reducto has Header/Footer block types
|
|
1969
|
+
// Extended features
|
|
1970
|
+
embedOptimized: isParse,
|
|
1971
|
+
// Reducto Parse supports retrieval.embedding_optimized: true
|
|
1972
|
+
passwordProtected: true,
|
|
1973
|
+
// Reducto handles encrypted PDFs
|
|
1974
|
+
contentFiltering: true,
|
|
1975
|
+
// Reducto can filter block types
|
|
1976
|
+
ocrMode: opts.ocrSystem ?? false,
|
|
1977
|
+
// Reducto has ocr_system selection
|
|
1978
|
+
webhookCallback: true,
|
|
1979
|
+
// Reducto supports webhook callbacks
|
|
1980
|
+
mediaResolution: false,
|
|
1981
|
+
changeTracking: true,
|
|
1982
|
+
// Reducto tracks changes in Word docs
|
|
1983
|
+
hyperlinkExtraction: true,
|
|
1984
|
+
// Reducto extracts hyperlinks via formatting.include
|
|
1985
|
+
chartUnderstanding: isParse,
|
|
1986
|
+
// Reducto enhance.agentic[].advanced_chart_agent for figures
|
|
1987
|
+
imageCaptions: false,
|
|
1988
|
+
// Not available in Reducto
|
|
1989
|
+
signatureExtraction: false,
|
|
1990
|
+
// NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
|
|
1991
|
+
commentExtraction: isParse || isExtract,
|
|
1992
|
+
// Reducto formatting.include: ["comments"]
|
|
1993
|
+
highlightExtraction: isParse || isExtract,
|
|
1994
|
+
// Reducto formatting.include: ["highlight"]
|
|
1995
|
+
figureSummaries: isParse,
|
|
1996
|
+
// Reducto enhance.summarize_figures
|
|
1898
1997
|
outputFormats
|
|
1899
1998
|
};
|
|
1900
1999
|
return {
|
|
@@ -1980,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
1980
2079
|
// Unsiloed doesn't have page range option
|
|
1981
2080
|
languageHints: false,
|
|
1982
2081
|
// Unsiloed doesn't support language hints
|
|
1983
|
-
processingModes:
|
|
2082
|
+
processingModes: false,
|
|
2083
|
+
// Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
|
|
1984
2084
|
agenticMode: false,
|
|
1985
2085
|
// Unsiloed doesn't have agentic mode
|
|
1986
2086
|
customPrompts: false,
|
|
@@ -2002,14 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
2002
2102
|
tableMerging: false,
|
|
2003
2103
|
confidence: d.outputFormat?.features?.confidence ?? false,
|
|
2004
2104
|
// Unsiloed may provide confidence
|
|
2005
|
-
boundingBoxes: d.outputFormat?.features?.boundingBoxes ??
|
|
2006
|
-
// Unsiloed
|
|
2105
|
+
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
|
|
2106
|
+
// Unsiloed Parse has bounding boxes
|
|
2107
|
+
imageBoundingBoxes: false,
|
|
2108
|
+
// Unsiloed doesn't return image-specific bboxes
|
|
2007
2109
|
schemaValidation: isExtract,
|
|
2008
2110
|
// Extract supports schema validation
|
|
2009
|
-
handwrittenText: false,
|
|
2010
|
-
//
|
|
2111
|
+
handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
|
|
2112
|
+
// Parse supports handwriting
|
|
2011
2113
|
headerFooterExtraction: false,
|
|
2012
2114
|
// Unsiloed doesn't extract header/footer separately
|
|
2115
|
+
// Extended features
|
|
2116
|
+
embedOptimized: false,
|
|
2117
|
+
passwordProtected: false,
|
|
2118
|
+
contentFiltering: isParse,
|
|
2119
|
+
// Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
|
|
2120
|
+
ocrMode: isParse,
|
|
2121
|
+
// Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
|
|
2122
|
+
webhookCallback: false,
|
|
2123
|
+
// Unsiloed is synchronous
|
|
2124
|
+
mediaResolution: false,
|
|
2125
|
+
changeTracking: false,
|
|
2126
|
+
hyperlinkExtraction: false,
|
|
2127
|
+
chartUnderstanding: false,
|
|
2128
|
+
// Not available in Unsiloed
|
|
2129
|
+
imageCaptions: false,
|
|
2130
|
+
// Not available in Unsiloed
|
|
2131
|
+
signatureExtraction: false,
|
|
2132
|
+
// Not available in Unsiloed
|
|
2133
|
+
commentExtraction: false,
|
|
2134
|
+
// Not available in Unsiloed
|
|
2135
|
+
highlightExtraction: false,
|
|
2136
|
+
// Not available in Unsiloed
|
|
2137
|
+
figureSummaries: false,
|
|
2138
|
+
// Not available in Unsiloed
|
|
2013
2139
|
outputFormats
|
|
2014
2140
|
};
|
|
2015
2141
|
return {
|
|
@@ -2038,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
2038
2164
|
supportsImageExtraction: false,
|
|
2039
2165
|
supportsPageMarkers: false,
|
|
2040
2166
|
supportsLanguageHints: false,
|
|
2041
|
-
supportsProcessingModes:
|
|
2167
|
+
supportsProcessingModes: false,
|
|
2168
|
+
// Unsiloed doesn't have fast/balanced/high_accuracy modes
|
|
2042
2169
|
supportsSegmentation: isSplit || isCategorize,
|
|
2043
2170
|
outputFormats
|
|
2044
2171
|
},
|
|
@@ -2256,7 +2383,7 @@ function matchesModelFilter(model, filter) {
|
|
|
2256
2383
|
}
|
|
2257
2384
|
if (filter.hasFeatures && filter.hasFeatures.length > 0) {
|
|
2258
2385
|
for (const feature of filter.hasFeatures) {
|
|
2259
|
-
if (model.features[feature]
|
|
2386
|
+
if (!isFeatureEnabled(model.features[feature])) {
|
|
2260
2387
|
return false;
|
|
2261
2388
|
}
|
|
2262
2389
|
}
|
|
@@ -2299,6 +2426,44 @@ function getAllModels() {
|
|
|
2299
2426
|
function clearModelRegistry() {
|
|
2300
2427
|
modelRegistry.clear();
|
|
2301
2428
|
}
|
|
2429
|
+
/**
 * Page-numbering convention keyed by provider source.
 * Sources that are not listed fall back to "1-indexed" in getPageIndexing.
 */
var PAGE_INDEXING = {
  datalab: "0-indexed",
  reducto: "1-indexed",
  mistral: "0-indexed",
  unsiloed: "1-indexed",
  // Default assumption
  llm: "1-indexed"
  // N/A but default
};
/**
 * Resolves the page-numbering convention for a provider.
 *
 * @param {string|{source: string}} provider - Either a source name, or an
 *   object whose `source` property holds the source name.
 * @returns {string} "0-indexed" or "1-indexed"; unknown sources yield
 *   "1-indexed".
 */
function getPageIndexing(provider) {
  const source = typeof provider === "string" ? provider : provider.source;
  // Only consult the table's own keys: a plain bracket lookup would walk the
  // prototype chain and hand back e.g. Object.prototype.constructor for a
  // source named "constructor" instead of the documented default.
  if (typeof source === "string" && Object.prototype.hasOwnProperty.call(PAGE_INDEXING, source)) {
    return PAGE_INDEXING[source];
  }
  return "1-indexed";
}
|
|
2442
|
+
/**
 * Translates the SDK-level `maxPages` / `pageRange` options into the
 * page-selection fields used for a given provider.
 *
 * Precedence:
 *  1. An explicit `pageRange` wins: it is returned as `page_range` and any
 *     `maxPages` passed alongside it is dropped.
 *  2. Otherwise, when the provider marks `maxPages` as "derived", a
 *     `page_range` covering the first `maxPages` pages is built using the
 *     provider's page indexing (plus a `pages` index array for the "mistral"
 *     source).
 *  3. Otherwise, when the provider's `maxPages` feature is enabled, `maxPages`
 *     is passed through unchanged inside `remainingOptions`.
 *
 * @param {object} options - Caller options; `maxPages` and `pageRange` are
 *   extracted, everything else is returned in `remainingOptions`.
 * @param {{source: string, features: {maxPages: *}}} provider - Provider
 *   metadata; only `source` and `features.maxPages` are read here.
 * @returns {{remainingOptions: object, page_range?: *, pages?: number[]}}
 * @throws {RangeError} If a page range has to be derived and `maxPages` is
 *   not a positive integer (previously this produced malformed ranges such
 *   as "0--1" or "1-0").
 */
function transformDerivedFeatures(options, provider) {
  const { maxPages, pageRange, ...remainingOptions } = options;
  const result = { remainingOptions };
  if (pageRange !== void 0) {
    result.page_range = pageRange;
    return result;
  }
  if (maxPages === void 0) {
    return result;
  }
  if (provider.features.maxPages === "derived") {
    // Numeric strings keep working through coercion; anything that is not a
    // whole page count >= 1 cannot be expressed as a valid range.
    const pageCount = Number(maxPages);
    if (!Number.isInteger(pageCount) || pageCount < 1) {
      throw new RangeError(`maxPages must be a positive integer, received: ${String(maxPages)}`);
    }
    const indexing = getPageIndexing(provider);
    if (indexing === "0-indexed") {
      result.page_range = `0-${pageCount - 1}`;
      if (provider.source === "mistral") {
        // Explicit list of 0-based page indices alongside the range string.
        result.pages = Array.from({ length: pageCount }, (_, i) => i);
      }
    } else {
      result.page_range = `1-${pageCount}`;
    }
  } else if (isFeatureEnabled(provider.features.maxPages)) {
    // Natively supported: forward the caller's value untouched.
    result.remainingOptions.maxPages = maxPages;
  }
  return result;
}
|
|
2464
|
+
/**
 * Tells whether a provider's `maxPages` feature is marked "derived".
 *
 * @param {{features: {maxPages: *}}} provider - Provider metadata; only
 *   `features.maxPages` is read.
 * @returns {boolean} `true` only when `features.maxPages` is exactly the
 *   string "derived".
 */
function requiresMaxPagesTransformation(provider) {
  const { maxPages: maxPagesStatus } = provider.features;
  return maxPagesStatus === "derived";
}
|
|
2302
2467
|
function normalizeMistralProvider(id, d) {
|
|
2303
2468
|
const opts = d.supportedOptions ?? {};
|
|
2304
2469
|
const isVLM = d.type === "VLM";
|
|
@@ -2314,7 +2479,7 @@ function normalizeMistralProvider(id, d) {
|
|
|
2314
2479
|
const features = {
|
|
2315
2480
|
maxPages: d.inputFormats?.maxPages !== void 0,
|
|
2316
2481
|
pageRange: true,
|
|
2317
|
-
// Mistral supports pages param: "0-5" or [0,2,5]
|
|
2482
|
+
// Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
|
|
2318
2483
|
languageHints: false,
|
|
2319
2484
|
// Mistral doesn't support language hints
|
|
2320
2485
|
processingModes: false,
|
|
@@ -2342,14 +2507,39 @@ function normalizeMistralProvider(id, d) {
|
|
|
2342
2507
|
tableMerging: false,
|
|
2343
2508
|
confidence: false,
|
|
2344
2509
|
// Mistral doesn't provide confidence scores
|
|
2345
|
-
boundingBoxes:
|
|
2346
|
-
//
|
|
2510
|
+
boundingBoxes: false,
|
|
2511
|
+
// Mistral does NOT provide text-level bounding boxes
|
|
2512
|
+
imageBoundingBoxes: true,
|
|
2513
|
+
// Mistral provides image/figure bounding boxes only
|
|
2347
2514
|
schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
|
|
2348
2515
|
// VLM supports schema
|
|
2349
2516
|
handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
|
|
2350
2517
|
// Excellent handwriting support
|
|
2351
2518
|
headerFooterExtraction: opts.extractHeader ?? opts.extractFooter ?? false,
|
|
2352
2519
|
// extract_header/extract_footer
|
|
2520
|
+
// Extended features
|
|
2521
|
+
embedOptimized: false,
|
|
2522
|
+
passwordProtected: false,
|
|
2523
|
+
contentFiltering: false,
|
|
2524
|
+
ocrMode: false,
|
|
2525
|
+
webhookCallback: false,
|
|
2526
|
+
// Mistral is synchronous
|
|
2527
|
+
mediaResolution: false,
|
|
2528
|
+
changeTracking: false,
|
|
2529
|
+
hyperlinkExtraction: true,
|
|
2530
|
+
// Response pages[].hyperlinks[] auto-extracted
|
|
2531
|
+
chartUnderstanding: false,
|
|
2532
|
+
// Not available as separate feature in Mistral
|
|
2533
|
+
imageCaptions: false,
|
|
2534
|
+
// Not available in Mistral
|
|
2535
|
+
signatureExtraction: false,
|
|
2536
|
+
// Not available in Mistral
|
|
2537
|
+
commentExtraction: false,
|
|
2538
|
+
// Not available in Mistral
|
|
2539
|
+
highlightExtraction: false,
|
|
2540
|
+
// Not available in Mistral
|
|
2541
|
+
figureSummaries: false,
|
|
2542
|
+
// Not available in Mistral
|
|
2353
2543
|
outputFormats
|
|
2354
2544
|
};
|
|
2355
2545
|
return {
|
|
@@ -2366,8 +2556,8 @@ function normalizeMistralProvider(id, d) {
|
|
|
2366
2556
|
capabilities: {
|
|
2367
2557
|
supportsImages: d.capabilities?.supportsImages ?? true,
|
|
2368
2558
|
supportsPDFs: d.capabilities?.supportsPDFs ?? true,
|
|
2369
|
-
supportsDocuments: d.capabilities?.supportsDocuments ??
|
|
2370
|
-
// DOCX
|
|
2559
|
+
supportsDocuments: d.capabilities?.supportsDocuments ?? true,
|
|
2560
|
+
// Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
|
|
2371
2561
|
supportsReasoning: false,
|
|
2372
2562
|
// OCR 3 doesn't do reasoning
|
|
2373
2563
|
supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
|
|
@@ -2635,6 +2825,7 @@ export {
|
|
|
2635
2825
|
getNodeTypeName,
|
|
2636
2826
|
getPDFPageCount,
|
|
2637
2827
|
getPageCountMetadata,
|
|
2828
|
+
getPageIndexing,
|
|
2638
2829
|
getProviderById,
|
|
2639
2830
|
getProvidersBySource,
|
|
2640
2831
|
getProvidersForLargeFiles,
|
|
@@ -2642,6 +2833,7 @@ export {
|
|
|
2642
2833
|
getSuggestedConnections,
|
|
2643
2834
|
getTotalPageCount,
|
|
2644
2835
|
getValidForEachStarters,
|
|
2836
|
+
isFeatureEnabled,
|
|
2645
2837
|
isLocalEndpoint,
|
|
2646
2838
|
isPDFDocument,
|
|
2647
2839
|
isRetryableError,
|
|
@@ -2653,11 +2845,13 @@ export {
|
|
|
2653
2845
|
queryProviders,
|
|
2654
2846
|
registerProviderMetadata,
|
|
2655
2847
|
registerProviderWithModels,
|
|
2848
|
+
requiresMaxPagesTransformation,
|
|
2656
2849
|
resolveDocument,
|
|
2657
2850
|
resolveModelMetadata,
|
|
2658
2851
|
runPipeline,
|
|
2659
2852
|
splitPDFIntoChunks,
|
|
2660
2853
|
toProviderString,
|
|
2854
|
+
transformDerivedFeatures,
|
|
2661
2855
|
validateFlowInputFormat,
|
|
2662
2856
|
validateJson,
|
|
2663
2857
|
validateMimeType,
|