@doclo/core 0.1.11 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +131 -25
- package/dist/index.js +351 -29
- package/dist/index.js.map +1 -1
- package/dist/internal/validation-utils.d.ts +1 -1
- package/dist/pdf-utils.d.ts +1 -1
- package/dist/{validation-D_EcHqPl.d.ts → validation-wlK06puw.d.ts} +1 -1
- package/dist/validation.d.ts +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
|
|
|
1398
1398
|
}
|
|
1399
1399
|
|
|
1400
1400
|
// src/provider-query.ts
|
|
1401
|
+
/**
 * Reports whether a feature status value counts as "enabled".
 *
 * A status is enabled when it is exactly `true`, `"deprecated"`, or
 * `"derived"`; every other value (including `false`, `undefined`, and
 * truthy values such as `1` or `"true"`) is treated as disabled.
 *
 * @param {*} status - Feature status taken from a provider/model feature map.
 * @returns {boolean} `true` when the status is one of the enabled values.
 */
function isFeatureEnabled(status) {
  // `switch` compares with strict equality, so no coercion takes place.
  switch (status) {
    case true:
    case "deprecated":
    case "derived":
      return true;
    default:
      return false;
  }
}
|
|
1401
1404
|
var providerRegistry = /* @__PURE__ */ new Map();
|
|
1402
1405
|
function registerProviderMetadata(source, metadata, normalizer) {
|
|
1403
1406
|
const normalized = /* @__PURE__ */ new Map();
|
|
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
|
|
|
1482
1485
|
}
|
|
1483
1486
|
if (filter.hasFeatures && filter.hasFeatures.length > 0) {
|
|
1484
1487
|
providers = providers.filter(
|
|
1485
|
-
(p) => filter.hasFeatures.every((feature) => p.features[feature]
|
|
1488
|
+
(p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
|
|
1486
1489
|
);
|
|
1487
1490
|
}
|
|
1488
1491
|
if (filter.outputFormat) {
|
|
@@ -1558,6 +1561,8 @@ function defaultNormalizer(id, data, source) {
|
|
|
1558
1561
|
return normalizeReductoProvider(id, d);
|
|
1559
1562
|
} else if (source === "unsiloed") {
|
|
1560
1563
|
return normalizeUnsiloedProvider(id, d);
|
|
1564
|
+
} else if (source === "mistral") {
|
|
1565
|
+
return normalizeMistralProvider(id, d);
|
|
1561
1566
|
}
|
|
1562
1567
|
const defaultOutputFormats = { text: true, markdown: false, html: false, json: false };
|
|
1563
1568
|
const defaultFeatures = {
|
|
@@ -1579,8 +1584,25 @@ function defaultNormalizer(id, data, source) {
|
|
|
1579
1584
|
tableMerging: false,
|
|
1580
1585
|
confidence: false,
|
|
1581
1586
|
boundingBoxes: false,
|
|
1587
|
+
imageBoundingBoxes: false,
|
|
1582
1588
|
schemaValidation: false,
|
|
1583
1589
|
handwrittenText: false,
|
|
1590
|
+
headerFooterExtraction: false,
|
|
1591
|
+
// Extended features
|
|
1592
|
+
embedOptimized: false,
|
|
1593
|
+
passwordProtected: false,
|
|
1594
|
+
contentFiltering: false,
|
|
1595
|
+
ocrMode: false,
|
|
1596
|
+
webhookCallback: false,
|
|
1597
|
+
mediaResolution: false,
|
|
1598
|
+
changeTracking: false,
|
|
1599
|
+
hyperlinkExtraction: false,
|
|
1600
|
+
chartUnderstanding: false,
|
|
1601
|
+
imageCaptions: false,
|
|
1602
|
+
signatureExtraction: false,
|
|
1603
|
+
commentExtraction: false,
|
|
1604
|
+
highlightExtraction: false,
|
|
1605
|
+
figureSummaries: false,
|
|
1584
1606
|
outputFormats: defaultOutputFormats
|
|
1585
1607
|
};
|
|
1586
1608
|
return {
|
|
@@ -1635,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
|
|
|
1635
1657
|
html: true,
|
|
1636
1658
|
json: d.capabilities?.supportsStructuredOutput ?? true
|
|
1637
1659
|
};
|
|
1660
|
+
const vendor = d.vendor ?? id;
|
|
1638
1661
|
const features = {
|
|
1639
|
-
maxPages:
|
|
1640
|
-
|
|
1641
|
-
|
|
1662
|
+
maxPages: "derived",
|
|
1663
|
+
// SDK can limit via pre-processing
|
|
1664
|
+
pageRange: false,
|
|
1665
|
+
// No native API support - LLMs receive full text
|
|
1642
1666
|
languageHints: false,
|
|
1643
1667
|
// Not applicable to LLMs
|
|
1644
1668
|
processingModes: false,
|
|
@@ -1651,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1651
1675
|
// LLMs don't extract images
|
|
1652
1676
|
pageMarkers: false,
|
|
1653
1677
|
// LLMs don't add page markers
|
|
1654
|
-
citations: false,
|
|
1655
|
-
//
|
|
1678
|
+
citations: vendor === "anthropic" ? true : false,
|
|
1679
|
+
// Anthropic has Citations API
|
|
1656
1680
|
chunking: false,
|
|
1657
1681
|
// LLMs don't do chunking
|
|
1658
1682
|
segmentation: false,
|
|
@@ -1666,13 +1690,32 @@ function normalizeLLMProvider(id, d) {
|
|
|
1666
1690
|
// LLMs don't provide confidence scores
|
|
1667
1691
|
boundingBoxes: false,
|
|
1668
1692
|
// LLMs don't provide bounding boxes
|
|
1693
|
+
imageBoundingBoxes: false,
|
|
1694
|
+
// LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
|
|
1669
1695
|
schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
|
|
1670
1696
|
// Some LLMs support schema validation
|
|
1671
1697
|
handwrittenText: false,
|
|
1672
1698
|
// Not specific to LLMs
|
|
1699
|
+
headerFooterExtraction: false,
|
|
1700
|
+
// LLMs don't extract header/footer separately
|
|
1701
|
+
// Extended features
|
|
1702
|
+
embedOptimized: false,
|
|
1703
|
+
passwordProtected: false,
|
|
1704
|
+
contentFiltering: false,
|
|
1705
|
+
ocrMode: false,
|
|
1706
|
+
webhookCallback: false,
|
|
1707
|
+
mediaResolution: vendor === "google" ? true : false,
|
|
1708
|
+
// Google Gemini has mediaResolution
|
|
1709
|
+
changeTracking: false,
|
|
1710
|
+
hyperlinkExtraction: false,
|
|
1711
|
+
chartUnderstanding: false,
|
|
1712
|
+
imageCaptions: false,
|
|
1713
|
+
signatureExtraction: false,
|
|
1714
|
+
commentExtraction: false,
|
|
1715
|
+
highlightExtraction: false,
|
|
1716
|
+
figureSummaries: false,
|
|
1673
1717
|
outputFormats
|
|
1674
1718
|
};
|
|
1675
|
-
const vendor = d.vendor ?? id;
|
|
1676
1719
|
return {
|
|
1677
1720
|
id,
|
|
1678
1721
|
name: d.name ?? id,
|
|
@@ -1693,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1693
1736
|
supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
|
|
1694
1737
|
// NEW capabilities
|
|
1695
1738
|
supportsPrompts: true,
|
|
1696
|
-
supportsCitations:
|
|
1739
|
+
supportsCitations: vendor === "anthropic",
|
|
1740
|
+
// Anthropic has Citations API
|
|
1697
1741
|
supportsChunking: false,
|
|
1698
1742
|
supportsImageExtraction: false,
|
|
1699
1743
|
supportsPageMarkers: false,
|
|
@@ -1740,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1740
1784
|
function normalizeDatalabProvider(id, d) {
|
|
1741
1785
|
const opts = d.supportedOptions ?? {};
|
|
1742
1786
|
const isVLM = d.type === "VLM";
|
|
1787
|
+
const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
|
|
1788
|
+
const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
|
|
1743
1789
|
const model = d.model ?? id;
|
|
1744
1790
|
const outputFormats = {
|
|
1745
1791
|
text: true,
|
|
@@ -1750,33 +1796,61 @@ function normalizeDatalabProvider(id, d) {
|
|
|
1750
1796
|
const features = {
|
|
1751
1797
|
maxPages: opts.maxPages ?? false,
|
|
1752
1798
|
pageRange: opts.pageRange ?? false,
|
|
1753
|
-
languageHints: opts.langs
|
|
1754
|
-
//
|
|
1799
|
+
languageHints: opts.langs ? "deprecated" : false,
|
|
1800
|
+
// API ignores, handled automatically
|
|
1755
1801
|
processingModes: opts.mode ?? false,
|
|
1756
1802
|
agenticMode: false,
|
|
1757
1803
|
// Datalab doesn't have agentic mode
|
|
1758
|
-
customPrompts: opts.blockCorrectionPrompt
|
|
1804
|
+
customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
|
|
1805
|
+
// Not currently supported
|
|
1759
1806
|
imageExtraction: opts.extractImages ?? false,
|
|
1760
1807
|
pageMarkers: opts.paginate ?? false,
|
|
1761
1808
|
// maps from 'paginate'
|
|
1762
|
-
citations:
|
|
1809
|
+
citations: isMarkerVLM ? true : false,
|
|
1810
|
+
// Marker VLM has citations
|
|
1763
1811
|
chunking: false,
|
|
1764
1812
|
// Datalab doesn't have chunking
|
|
1765
1813
|
segmentation: opts.segmentation ?? false,
|
|
1766
|
-
stripExistingOCR: opts.stripExistingOCR
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
//
|
|
1814
|
+
stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
|
|
1815
|
+
// Managed automatically
|
|
1816
|
+
formatLines: opts.formatLines ? "deprecated" : false,
|
|
1817
|
+
// Handled automatically
|
|
1818
|
+
forceOCR: "deprecated",
|
|
1819
|
+
// DEPRECATED: force_ocr param has no effect per API docs
|
|
1770
1820
|
tableOutputFormats: false,
|
|
1771
1821
|
tableMerging: false,
|
|
1772
1822
|
confidence: false,
|
|
1773
1823
|
// Datalab doesn't provide confidence scores
|
|
1774
1824
|
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
|
|
1775
|
-
// Datalab provides
|
|
1825
|
+
// Datalab Surya provides text bboxes
|
|
1826
|
+
imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
|
|
1827
|
+
// Marker extracts images with bboxes
|
|
1776
1828
|
schemaValidation: isVLM,
|
|
1777
1829
|
// VLM providers support schema validation
|
|
1778
1830
|
handwrittenText: true,
|
|
1779
1831
|
// Datalab handles handwritten text
|
|
1832
|
+
headerFooterExtraction: false,
|
|
1833
|
+
// Datalab has issues with header/footer extraction
|
|
1834
|
+
// Extended features
|
|
1835
|
+
embedOptimized: false,
|
|
1836
|
+
passwordProtected: false,
|
|
1837
|
+
contentFiltering: false,
|
|
1838
|
+
ocrMode: false,
|
|
1839
|
+
webhookCallback: true,
|
|
1840
|
+
// Datalab supports webhook callbacks
|
|
1841
|
+
mediaResolution: false,
|
|
1842
|
+
changeTracking: true,
|
|
1843
|
+
// Datalab marker_extras supports track_changes
|
|
1844
|
+
hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
|
|
1845
|
+
// Datalab extras=extract_links
|
|
1846
|
+
chartUnderstanding: isMarkerOCR || isMarkerVLM,
|
|
1847
|
+
// Datalab extras=chart_understanding
|
|
1848
|
+
imageCaptions: isMarkerOCR || isMarkerVLM,
|
|
1849
|
+
// Datalab disable_image_captions param
|
|
1850
|
+
signatureExtraction: false,
|
|
1851
|
+
commentExtraction: false,
|
|
1852
|
+
highlightExtraction: false,
|
|
1853
|
+
figureSummaries: false,
|
|
1780
1854
|
outputFormats
|
|
1781
1855
|
};
|
|
1782
1856
|
return {
|
|
@@ -1845,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
|
|
|
1845
1919
|
const opts = d.supportedOptions ?? {};
|
|
1846
1920
|
const isVLM = d.type === "VLM";
|
|
1847
1921
|
const isExtract = d.compatibleNodes?.extract === true;
|
|
1922
|
+
const isParse = d.compatibleNodes?.parse === true;
|
|
1848
1923
|
const model = d.model ?? "v1";
|
|
1849
1924
|
const outputFormats = {
|
|
1850
1925
|
text: d.outputFormat?.features?.textLines ?? true,
|
|
@@ -1854,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
|
|
|
1854
1929
|
json: d.outputFormat?.features?.structuredJSON ?? isExtract
|
|
1855
1930
|
};
|
|
1856
1931
|
const features = {
|
|
1857
|
-
maxPages: opts.
|
|
1932
|
+
maxPages: opts.pageRange ?? false ? "derived" : false,
|
|
1933
|
+
// SDK derives from pageRange (1-indexed)
|
|
1858
1934
|
pageRange: opts.pageRange ?? false,
|
|
1859
|
-
languageHints:
|
|
1860
|
-
// Reducto doesn't support
|
|
1935
|
+
languageHints: false,
|
|
1936
|
+
// Reducto doesn't support language hints
|
|
1861
1937
|
processingModes: false,
|
|
1862
1938
|
// Reducto uses agentic instead
|
|
1863
1939
|
agenticMode: opts.mode ?? false,
|
|
@@ -1880,12 +1956,44 @@ function normalizeReductoProvider(id, d) {
|
|
|
1880
1956
|
// Parse has mergeTables
|
|
1881
1957
|
confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
|
|
1882
1958
|
// Reducto Parse has confidence
|
|
1883
|
-
boundingBoxes: d.outputFormat?.features?.boundingBoxes ??
|
|
1884
|
-
// Reducto Parse has bounding boxes
|
|
1959
|
+
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
|
|
1960
|
+
// Reducto Parse has text bounding boxes
|
|
1961
|
+
imageBoundingBoxes: isParse ? true : false,
|
|
1962
|
+
// Reducto Parse has figure bounding boxes
|
|
1885
1963
|
schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
|
|
1886
1964
|
// Extract has schema validation
|
|
1887
1965
|
handwrittenText: false,
|
|
1888
1966
|
// Reducto doesn't specifically advertise handwriting
|
|
1967
|
+
headerFooterExtraction: true,
|
|
1968
|
+
// Reducto has Header/Footer block types
|
|
1969
|
+
// Extended features
|
|
1970
|
+
embedOptimized: isParse,
|
|
1971
|
+
// Reducto Parse supports retrieval.embedding_optimized: true
|
|
1972
|
+
passwordProtected: true,
|
|
1973
|
+
// Reducto handles encrypted PDFs
|
|
1974
|
+
contentFiltering: true,
|
|
1975
|
+
// Reducto can filter block types
|
|
1976
|
+
ocrMode: opts.ocrSystem ?? false,
|
|
1977
|
+
// Reducto has ocr_system selection
|
|
1978
|
+
webhookCallback: true,
|
|
1979
|
+
// Reducto supports webhook callbacks
|
|
1980
|
+
mediaResolution: false,
|
|
1981
|
+
changeTracking: true,
|
|
1982
|
+
// Reducto tracks changes in Word docs
|
|
1983
|
+
hyperlinkExtraction: true,
|
|
1984
|
+
// Reducto extracts hyperlinks via formatting.include
|
|
1985
|
+
chartUnderstanding: isParse,
|
|
1986
|
+
// Reducto enhance.agentic[].advanced_chart_agent for figures
|
|
1987
|
+
imageCaptions: false,
|
|
1988
|
+
// Not available in Reducto
|
|
1989
|
+
signatureExtraction: false,
|
|
1990
|
+
// NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
|
|
1991
|
+
commentExtraction: isParse || isExtract,
|
|
1992
|
+
// Reducto formatting.include: ["comments"]
|
|
1993
|
+
highlightExtraction: isParse || isExtract,
|
|
1994
|
+
// Reducto formatting.include: ["highlight"]
|
|
1995
|
+
figureSummaries: isParse,
|
|
1996
|
+
// Reducto enhance.summarize_figures
|
|
1889
1997
|
outputFormats
|
|
1890
1998
|
};
|
|
1891
1999
|
return {
|
|
@@ -1971,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
1971
2079
|
// Unsiloed doesn't have page range option
|
|
1972
2080
|
languageHints: false,
|
|
1973
2081
|
// Unsiloed doesn't support language hints
|
|
1974
|
-
processingModes:
|
|
2082
|
+
processingModes: false,
|
|
2083
|
+
// Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
|
|
1975
2084
|
agenticMode: false,
|
|
1976
2085
|
// Unsiloed doesn't have agentic mode
|
|
1977
2086
|
customPrompts: false,
|
|
@@ -1993,12 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
1993
2102
|
tableMerging: false,
|
|
1994
2103
|
confidence: d.outputFormat?.features?.confidence ?? false,
|
|
1995
2104
|
// Unsiloed may provide confidence
|
|
1996
|
-
boundingBoxes: d.outputFormat?.features?.boundingBoxes ??
|
|
1997
|
-
// Unsiloed
|
|
2105
|
+
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
|
|
2106
|
+
// Unsiloed Parse has bounding boxes
|
|
2107
|
+
imageBoundingBoxes: false,
|
|
2108
|
+
// Unsiloed doesn't return image-specific bboxes
|
|
1998
2109
|
schemaValidation: isExtract,
|
|
1999
2110
|
// Extract supports schema validation
|
|
2000
|
-
handwrittenText: false,
|
|
2001
|
-
//
|
|
2111
|
+
handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
|
|
2112
|
+
// Parse supports handwriting
|
|
2113
|
+
headerFooterExtraction: false,
|
|
2114
|
+
// Unsiloed doesn't extract header/footer separately
|
|
2115
|
+
// Extended features
|
|
2116
|
+
embedOptimized: false,
|
|
2117
|
+
passwordProtected: false,
|
|
2118
|
+
contentFiltering: isParse,
|
|
2119
|
+
// Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
|
|
2120
|
+
ocrMode: isParse,
|
|
2121
|
+
// Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
|
|
2122
|
+
webhookCallback: false,
|
|
2123
|
+
// Unsiloed is synchronous
|
|
2124
|
+
mediaResolution: false,
|
|
2125
|
+
changeTracking: false,
|
|
2126
|
+
hyperlinkExtraction: false,
|
|
2127
|
+
chartUnderstanding: false,
|
|
2128
|
+
// Not available in Unsiloed
|
|
2129
|
+
imageCaptions: false,
|
|
2130
|
+
// Not available in Unsiloed
|
|
2131
|
+
signatureExtraction: false,
|
|
2132
|
+
// Not available in Unsiloed
|
|
2133
|
+
commentExtraction: false,
|
|
2134
|
+
// Not available in Unsiloed
|
|
2135
|
+
highlightExtraction: false,
|
|
2136
|
+
// Not available in Unsiloed
|
|
2137
|
+
figureSummaries: false,
|
|
2138
|
+
// Not available in Unsiloed
|
|
2002
2139
|
outputFormats
|
|
2003
2140
|
};
|
|
2004
2141
|
return {
|
|
@@ -2027,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
2027
2164
|
supportsImageExtraction: false,
|
|
2028
2165
|
supportsPageMarkers: false,
|
|
2029
2166
|
supportsLanguageHints: false,
|
|
2030
|
-
supportsProcessingModes:
|
|
2167
|
+
supportsProcessingModes: false,
|
|
2168
|
+
// Unsiloed doesn't have fast/balanced/high_accuracy modes
|
|
2031
2169
|
supportsSegmentation: isSplit || isCategorize,
|
|
2032
2170
|
outputFormats
|
|
2033
2171
|
},
|
|
@@ -2245,7 +2383,7 @@ function matchesModelFilter(model, filter) {
|
|
|
2245
2383
|
}
|
|
2246
2384
|
if (filter.hasFeatures && filter.hasFeatures.length > 0) {
|
|
2247
2385
|
for (const feature of filter.hasFeatures) {
|
|
2248
|
-
if (model.features[feature]
|
|
2386
|
+
if (!isFeatureEnabled(model.features[feature])) {
|
|
2249
2387
|
return false;
|
|
2250
2388
|
}
|
|
2251
2389
|
}
|
|
@@ -2288,6 +2426,186 @@ function getAllModels() {
|
|
|
2288
2426
|
function clearModelRegistry() {
|
|
2289
2427
|
modelRegistry.clear();
|
|
2290
2428
|
}
|
|
2429
|
+
// Page-numbering convention keyed by provider source name.
// `var` is kept to match the rest of this bundled output.
var PAGE_INDEXING = {
  datalab: "0-indexed",
  reducto: "1-indexed",
  mistral: "0-indexed",
  unsiloed: "1-indexed",
  // Default assumption
  llm: "1-indexed"
  // N/A but default
};

/**
 * Returns the page-indexing convention for a provider.
 *
 * @param {string|{source?: string}} provider - Either a source name
 *   (e.g. `"mistral"`) or an object carrying a `source` property.
 * @returns {"0-indexed"|"1-indexed"} The convention listed in
 *   `PAGE_INDEXING`, or `"1-indexed"` when the source is not listed.
 */
function getPageIndexing(provider) {
  const source = typeof provider === "string" ? provider : provider?.source;
  // Only consult the table's own keys: a bare `PAGE_INDEXING[source]` would
  // resolve names such as "constructor" or "toString" through the prototype
  // chain and return a function instead of falling back to the default.
  if (Object.prototype.hasOwnProperty.call(PAGE_INDEXING, source)) {
    return PAGE_INDEXING[source];
  }
  return "1-indexed";
}
|
|
2442
|
+
/**
 * Splits `maxPages` / `pageRange` out of an options object and converts them
 * into the page-selection fields for the given provider.
 *
 * Resolution order:
 *  1. An explicit `pageRange` is copied to `page_range` and nothing else is
 *     derived.
 *  2. Otherwise, when `maxPages` is given and the provider marks `maxPages`
 *     as `"derived"`, a `page_range` string is built from the provider's page
 *     indexing (`0-(maxPages-1)` or `1-maxPages`). For a `"mistral"` source
 *     with 0-based indexing, a `pages` array `[0, …, maxPages-1]` is added too.
 *  3. Otherwise, when `maxPages` is given and the provider's `maxPages`
 *     feature is enabled, `maxPages` is put back into `remainingOptions`.
 *
 * @param {object} options - Caller options; may contain `maxPages` and `pageRange`.
 * @param {object} provider - Provider metadata; `features.maxPages` and
 *   `source` are read.
 * @returns {{remainingOptions: object, page_range?: *, pages?: number[]}}
 */
function transformDerivedFeatures(options, provider) {
  const { maxPages, pageRange, ...remainingOptions } = options;
  const result = { remainingOptions };

  // An explicit page range always wins and is passed through untouched.
  if (pageRange !== undefined) {
    result.page_range = pageRange;
    return result;
  }

  // Nothing to derive; the provider is deliberately not inspected here.
  if (maxPages === undefined) {
    return result;
  }

  const maxPagesSupport = provider.features.maxPages;

  if (maxPagesSupport === "derived") {
    const zeroBased = getPageIndexing(provider) === "0-indexed";
    if (!zeroBased) {
      result.page_range = `1-${maxPages}`;
      return result;
    }
    result.page_range = `0-${maxPages - 1}`;
    if (provider.source === "mistral") {
      result.pages = Array.from({ length: maxPages }, (_, pageIndex) => pageIndex);
    }
    return result;
  }

  // The provider accepts maxPages itself, so hand it back unchanged.
  if (isFeatureEnabled(maxPagesSupport)) {
    remainingOptions.maxPages = maxPages;
  }
  return result;
}
|
|
2464
|
+
/**
 * Tells whether a provider's `maxPages` feature is marked `"derived"`.
 *
 * @param {{features: {maxPages: *}}} provider - Provider metadata.
 * @returns {boolean} `true` only when `provider.features.maxPages === "derived"`.
 */
function requiresMaxPagesTransformation(provider) {
  const { maxPages: maxPagesSupport } = provider.features;
  return maxPagesSupport === "derived";
}
|
|
2467
|
+
/**
 * Converts raw Mistral provider metadata into the normalized provider record.
 *
 * Values present on `d` take precedence; missing values fall back to defaults
 * that depend on whether `d.type` is `"OCR"` or `"VLM"`.
 *
 * @param {string} id - Registry id, used as fallback for `id`, `name` and `model`.
 * @param {object} d - Raw provider metadata. Fields read: `id`, `name`,
 *   `type`, `model`, `supportedOptions`, `outputFormat.features`,
 *   `capabilities`, `inputRequirements`, `compatibleNodes`, `inputFormats`,
 *   `pricing`, `apiConfig.rateLimit`.
 * @returns {object} Normalized provider with `identity`, `capabilities`,
 *   `features`, `inputRequirements`, `compatibleNodes`, `inputFormats`,
 *   `pricing`, `rateLimits` and the untouched input under `raw`.
 */
function normalizeMistralProvider(id, d) {
  const opts = d.supportedOptions ?? {};
  // NOTE(review): a missing `d.type` is reported as "OCR" in the result but
  // does not get the OCR-based defaults below — confirm this is intended.
  const isVLM = d.type === "VLM";
  const isOCR = d.type === "OCR";
  const model = d.model ?? id;
  const mimeTypes = d.inputFormats?.mimeTypes ?? [];
  const outputFormats = {
    text: true,
    markdown: d.outputFormat?.features?.markdown ?? isOCR,
    html: d.outputFormat?.features?.htmlTables ?? isOCR,
    // OCR 3 can output HTML tables
    json: d.outputFormat?.features?.structuredJSON ?? isVLM
  };
  const features = {
    maxPages: d.inputFormats?.maxPages !== undefined,
    pageRange: true,
    // Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
    languageHints: false,
    // Mistral doesn't support language hints
    processingModes: false,
    // Mistral doesn't have processing modes
    agenticMode: false,
    // Mistral doesn't have agentic mode
    customPrompts: false,
    // Mistral OCR 3 doesn't support custom prompts
    imageExtraction: opts.includeImageBase64 ?? false,
    // Can include embedded images
    pageMarkers: false,
    // Mistral doesn't add page markers
    citations: false,
    // Mistral doesn't provide citations
    chunking: false,
    // Mistral doesn't do chunking
    segmentation: false,
    // Mistral doesn't do segmentation
    stripExistingOCR: false,
    formatLines: false,
    forceOCR: true,
    // OCR 3 always does OCR
    tableOutputFormats: opts.tableFormat ?? isOCR,
    // html or markdown table format
    tableMerging: false,
    confidence: false,
    // Mistral doesn't provide confidence scores
    boundingBoxes: false,
    // Mistral does NOT provide text-level bounding boxes
    imageBoundingBoxes: true,
    // Mistral provides image/figure bounding boxes only
    schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
    // VLM supports schema
    handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
    // Excellent handwriting support
    // Either option enables the feature. `||` is used instead of `??` so an
    // explicit `extractHeader: false` cannot mask `extractFooter: true`.
    headerFooterExtraction: opts.extractHeader || opts.extractFooter || false,
    // extract_header/extract_footer
    // Extended features
    embedOptimized: false,
    passwordProtected: false,
    contentFiltering: false,
    ocrMode: false,
    webhookCallback: false,
    // Mistral is synchronous
    mediaResolution: false,
    changeTracking: false,
    hyperlinkExtraction: true,
    // Response pages[].hyperlinks[] auto-extracted
    chartUnderstanding: false,
    // Not available as separate feature in Mistral
    imageCaptions: false,
    // Not available in Mistral
    signatureExtraction: false,
    // Not available in Mistral
    commentExtraction: false,
    // Not available in Mistral
    highlightExtraction: false,
    // Not available in Mistral
    figureSummaries: false,
    // Not available in Mistral
    outputFormats
  };
  return {
    id: d.id ?? id,
    name: d.name ?? id,
    source: "mistral",
    type: d.type ?? "OCR",
    // 3-layer identity
    identity: {
      provider: "mistral",
      model,
      method: "native"
    },
    capabilities: {
      supportsImages: d.capabilities?.supportsImages ?? true,
      supportsPDFs: d.capabilities?.supportsPDFs ?? true,
      supportsDocuments: d.capabilities?.supportsDocuments ?? true,
      // Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
      supportsReasoning: false,
      // OCR 3 doesn't do reasoning
      supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
      // Extended capabilities
      supportsPrompts: false,
      supportsCitations: false,
      supportsChunking: false,
      supportsImageExtraction: opts.includeImageBase64 ?? false,
      supportsPageMarkers: false,
      supportsLanguageHints: false,
      supportsProcessingModes: false,
      supportsSegmentation: false,
      outputFormats
    },
    features,
    // Mistral providers always need raw document input
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "raw-document",
      acceptedMethods: d.inputRequirements?.acceptedMethods ?? ["base64", "url"]
    },
    compatibleNodes: {
      parse: d.compatibleNodes?.parse ?? isOCR,
      extract: d.compatibleNodes?.extract ?? isVLM,
      categorize: d.compatibleNodes?.categorize ?? false,
      qualify: d.compatibleNodes?.qualify ?? false,
      split: d.compatibleNodes?.split ?? false
    },
    inputFormats: {
      // MIME types are split by whether they start with the "image/" prefix.
      imageMimeTypes: mimeTypes.filter((m) => m.startsWith("image/")),
      documentMimeTypes: mimeTypes.filter((m) => !m.startsWith("image/")),
      inputMethods: d.inputFormats?.inputMethods ?? ["base64", "url"],
      maxFileSize: d.inputFormats?.maxFileSize ?? 50,
      // 50MB limit
      maxPages: d.inputFormats?.maxPages ?? 1e3
    },
    pricing: {
      model: "per-page",
      perPage: d.pricing?.perPage ?? 2e-3,
      // $2/1000 pages
      currency: "USD",
      notes: d.pricing?.notes ?? "$2 per 1000 pages"
    },
    rateLimits: {
      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
    },
    raw: d
  };
}
|
|
2291
2609
|
|
|
2292
2610
|
// src/retry.ts
|
|
2293
2611
|
var DEFAULT_RETRY_CONFIG = {
|
|
@@ -2507,6 +2825,7 @@ export {
|
|
|
2507
2825
|
getNodeTypeName,
|
|
2508
2826
|
getPDFPageCount,
|
|
2509
2827
|
getPageCountMetadata,
|
|
2828
|
+
getPageIndexing,
|
|
2510
2829
|
getProviderById,
|
|
2511
2830
|
getProvidersBySource,
|
|
2512
2831
|
getProvidersForLargeFiles,
|
|
@@ -2514,6 +2833,7 @@ export {
|
|
|
2514
2833
|
getSuggestedConnections,
|
|
2515
2834
|
getTotalPageCount,
|
|
2516
2835
|
getValidForEachStarters,
|
|
2836
|
+
isFeatureEnabled,
|
|
2517
2837
|
isLocalEndpoint,
|
|
2518
2838
|
isPDFDocument,
|
|
2519
2839
|
isRetryableError,
|
|
@@ -2525,11 +2845,13 @@ export {
|
|
|
2525
2845
|
queryProviders,
|
|
2526
2846
|
registerProviderMetadata,
|
|
2527
2847
|
registerProviderWithModels,
|
|
2848
|
+
requiresMaxPagesTransformation,
|
|
2528
2849
|
resolveDocument,
|
|
2529
2850
|
resolveModelMetadata,
|
|
2530
2851
|
runPipeline,
|
|
2531
2852
|
splitPDFIntoChunks,
|
|
2532
2853
|
toProviderString,
|
|
2854
|
+
transformDerivedFeatures,
|
|
2533
2855
|
validateFlowInputFormat,
|
|
2534
2856
|
validateJson,
|
|
2535
2857
|
validateMimeType,
|