@doclo/core 0.1.12 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +128 -24
- package/dist/index.js +228 -34
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -511,6 +511,22 @@ type OutputFormatSupport = {
|
|
|
511
511
|
html: boolean;
|
|
512
512
|
json: boolean;
|
|
513
513
|
};
|
|
514
|
+
/**
 * Support level reported for a single normalized feature.
 *
 * - `true`: the provider API supports the feature natively
 * - `false`: the feature is not supported
 * - `'deprecated'`: the API has deprecated the feature; it may not work
 * - `'derived'`: the SDK supplies the feature through a transformation
 *   (e.g., maxPages from pageRange)
 */
type FeatureStatus = boolean | 'deprecated' | 'derived';
|
|
522
|
+
/**
 * Helper to check if a feature is enabled (true, deprecated, or derived).
 *
 * @param status - Feature status to test
 * @returns `true` when `status` is `true`, `'deprecated'`, or `'derived'`;
 *   `false` otherwise
 */
declare function isFeatureEnabled(status: FeatureStatus): boolean;
|
|
526
|
+
/**
 * Convention a provider uses when numbering pages: whether the first page
 * of a document is page 0 or page 1.
 */
type PageIndexing =
    | '0-indexed'
    | '1-indexed';
|
|
514
530
|
/**
|
|
515
531
|
* Normalized features across all providers.
|
|
516
532
|
* Maps provider-specific option names to unified names.
|
|
@@ -520,47 +536,77 @@ type OutputFormatSupport = {
|
|
|
520
536
|
*/
|
|
521
537
|
type NormalizedFeatures = {
|
|
522
538
|
/** Limit to first N pages */
|
|
523
|
-
maxPages:
|
|
539
|
+
maxPages: FeatureStatus;
|
|
524
540
|
/** Specific page range selection */
|
|
525
|
-
pageRange:
|
|
541
|
+
pageRange: FeatureStatus;
|
|
526
542
|
/** OCR language hints (maps from 'langs') */
|
|
527
|
-
languageHints:
|
|
543
|
+
languageHints: FeatureStatus;
|
|
528
544
|
/** Quality/speed modes (fast/balanced/high_accuracy) */
|
|
529
|
-
processingModes:
|
|
545
|
+
processingModes: FeatureStatus;
|
|
530
546
|
/** Reducto agentic mode (higher accuracy, more cost) */
|
|
531
|
-
agenticMode:
|
|
547
|
+
agenticMode: FeatureStatus;
|
|
532
548
|
/** Custom prompts (maps from blockCorrectionPrompt, additionalPrompt, systemPrompt) */
|
|
533
|
-
customPrompts:
|
|
549
|
+
customPrompts: FeatureStatus;
|
|
534
550
|
/** Extract embedded images (maps from extractImages, returnImages) */
|
|
535
|
-
imageExtraction:
|
|
551
|
+
imageExtraction: FeatureStatus;
|
|
536
552
|
/** Page delimiters (maps from paginate, addPageMarkers) */
|
|
537
|
-
pageMarkers:
|
|
538
|
-
/** Field-level citations with source references */
|
|
539
|
-
citations:
|
|
553
|
+
pageMarkers: FeatureStatus;
|
|
554
|
+
/** Field-level citations with source references (page/char/block indices) */
|
|
555
|
+
citations: FeatureStatus;
|
|
540
556
|
/** Document chunking modes (RAG-optimized) */
|
|
541
|
-
chunking:
|
|
557
|
+
chunking: FeatureStatus;
|
|
542
558
|
/** Auto-segmentation for multi-document PDFs */
|
|
543
|
-
segmentation:
|
|
559
|
+
segmentation: FeatureStatus;
|
|
544
560
|
/** Re-run OCR on already-OCR'd documents */
|
|
545
|
-
stripExistingOCR:
|
|
561
|
+
stripExistingOCR: FeatureStatus;
|
|
546
562
|
/** Format lines in output */
|
|
547
|
-
formatLines:
|
|
563
|
+
formatLines: FeatureStatus;
|
|
548
564
|
/** Force OCR even if text layer exists */
|
|
549
|
-
forceOCR:
|
|
565
|
+
forceOCR: FeatureStatus;
|
|
550
566
|
/** Table format options (html/json/md/csv) */
|
|
551
|
-
tableOutputFormats:
|
|
567
|
+
tableOutputFormats: FeatureStatus;
|
|
552
568
|
/** Merge consecutive tables */
|
|
553
|
-
tableMerging:
|
|
569
|
+
tableMerging: FeatureStatus;
|
|
554
570
|
/** Block-level confidence scores */
|
|
555
|
-
confidence:
|
|
556
|
-
/** Bounding box coordinates for
|
|
557
|
-
boundingBoxes:
|
|
571
|
+
confidence: FeatureStatus;
|
|
572
|
+
/** Bounding box coordinates for TEXT elements (pixel/normalized coords) */
|
|
573
|
+
boundingBoxes: FeatureStatus;
|
|
574
|
+
/** Bounding box coordinates for IMAGES/FIGURES only (not text) */
|
|
575
|
+
imageBoundingBoxes: FeatureStatus;
|
|
558
576
|
/** JSON schema validation for structured output */
|
|
559
|
-
schemaValidation:
|
|
577
|
+
schemaValidation: FeatureStatus;
|
|
560
578
|
/** Handwritten text recognition support */
|
|
561
|
-
handwrittenText:
|
|
579
|
+
handwrittenText: FeatureStatus;
|
|
562
580
|
/** Separate header/footer extraction from main content */
|
|
563
|
-
headerFooterExtraction:
|
|
581
|
+
headerFooterExtraction: FeatureStatus;
|
|
582
|
+
/** Optimize output for embeddings/RAG */
|
|
583
|
+
embedOptimized: FeatureStatus;
|
|
584
|
+
/** Handle encrypted/password-protected PDFs */
|
|
585
|
+
passwordProtected: FeatureStatus;
|
|
586
|
+
/** Filter block types (headers, footers, page numbers, etc.) */
|
|
587
|
+
contentFiltering: FeatureStatus;
|
|
588
|
+
/** OCR system/mode selection (standard/legacy, auto/full) */
|
|
589
|
+
ocrMode: FeatureStatus;
|
|
590
|
+
/** Async completion webhook callbacks */
|
|
591
|
+
webhookCallback: FeatureStatus;
|
|
592
|
+
/** Vision quality control (low/medium/high) - Gemini */
|
|
593
|
+
mediaResolution: FeatureStatus;
|
|
594
|
+
/** Track changes extraction from Word docs */
|
|
595
|
+
changeTracking: FeatureStatus;
|
|
596
|
+
/** Extract hyperlinks from documents */
|
|
597
|
+
hyperlinkExtraction: FeatureStatus;
|
|
598
|
+
/** Enhanced chart and graph interpretation (Datalab extras=chart_understanding) */
|
|
599
|
+
chartUnderstanding: FeatureStatus;
|
|
600
|
+
/** Control image caption generation (Datalab disable_image_captions) */
|
|
601
|
+
imageCaptions: FeatureStatus;
|
|
602
|
+
/** Extract signatures from documents (Reducto include: ["signatures"]) */
|
|
603
|
+
signatureExtraction: FeatureStatus;
|
|
604
|
+
/** Extract comments/annotations from documents (Reducto include: ["comments"]) */
|
|
605
|
+
commentExtraction: FeatureStatus;
|
|
606
|
+
/** Extract highlighted text from documents (Reducto include: ["highlight"]) */
|
|
607
|
+
highlightExtraction: FeatureStatus;
|
|
608
|
+
/** Summarize figures/charts with VLM (Reducto summarize_figures) */
|
|
609
|
+
figureSummaries: FeatureStatus;
|
|
564
610
|
/** Supported output formats */
|
|
565
611
|
outputFormats: OutputFormatSupport;
|
|
566
612
|
};
|
|
@@ -929,6 +975,64 @@ declare function getAllModels(): ResolvedModelMetadata[];
|
|
|
929
975
|
* Clear model registry (useful for testing)
|
|
930
976
|
*/
|
|
931
977
|
declare function clearModelRegistry(): void;
|
|
978
|
+
/**
 * Get the page indexing convention for a provider.
 *
 * When provider metadata is passed, its `source` field is used for the lookup.
 * Sources without a known convention fall back to `'1-indexed'`.
 *
 * @param provider - Provider metadata or source string
 * @returns Page indexing convention ('0-indexed' or '1-indexed')
 */
declare function getPageIndexing(provider: NormalizedProviderMetadata | string): PageIndexing;
|
|
985
|
+
/**
 * Options that can be transformed for derived features.
 */
type DerivedFeatureOptions = {
    /** Limit to first N pages */
    maxPages?: number;
    /** Specific page range selection; forwarded unchanged as `page_range` */
    pageRange?: string;
};
|
|
992
|
+
/**
 * Result of derived feature transformation.
 *
 * `page_range` and `pages` are only present when a page selection was
 * supplied or derived; `remainingOptions` is always present.
 */
type TransformedOptions = {
    /** The transformed page_range parameter (provider-specific format) */
    page_range?: string;
    /** Array format for providers that support it (e.g., Mistral) */
    pages?: number[];
    /**
     * Original options minus the derived ones. `maxPages` is added back here
     * when the provider supports it without needing a pageRange derivation.
     */
    remainingOptions: Record<string, unknown>;
};
|
|
1003
|
+
/**
 * Transform maxPages to provider-specific pageRange format.
 *
 * This utility handles the conversion when a provider has `maxPages: 'derived'`,
 * meaning the SDK provides maxPages functionality via the underlying pageRange API.
 *
 * Resolution order:
 * 1. If `pageRange` is supplied it is returned unchanged as `page_range` and
 *    `maxPages` is dropped.
 * 2. If `maxPages` is supplied and the provider's `maxPages` feature is
 *    `'derived'`, a range covering the first `maxPages` pages is built using
 *    the provider's page indexing convention.
 * 3. If `maxPages` is supplied and the feature is otherwise enabled, it is
 *    passed through inside `remainingOptions`.
 * 4. If the provider does not support `maxPages`, the value is omitted.
 *
 * @param options - User-provided options including maxPages
 * @param provider - Provider metadata to determine format
 * @returns Transformed options with provider-specific pageRange
 *
 * @example
 * ```typescript
 * // User wants first 5 pages from Reducto (1-indexed)
 * const result = transformDerivedFeatures({ maxPages: 5 }, reductoProvider);
 * // => { page_range: '1-5', remainingOptions: {} }
 *
 * // User wants first 5 pages from Datalab (0-indexed)
 * const result = transformDerivedFeatures({ maxPages: 5 }, datalabProvider);
 * // => { page_range: '0-4', remainingOptions: {} }
 *
 * // User wants first 5 pages from Mistral (0-indexed, array format)
 * const result = transformDerivedFeatures({ maxPages: 5 }, mistralProvider);
 * // => { page_range: '0-4', pages: [0,1,2,3,4], remainingOptions: {} }
 * ```
 */
declare function transformDerivedFeatures(options: DerivedFeatureOptions & Record<string, unknown>, provider: NormalizedProviderMetadata): TransformedOptions;
|
|
1029
|
+
/**
 * Check if a provider requires derived feature transformation for maxPages.
 *
 * This is the case exactly when the provider's `maxPages` feature status is
 * `'derived'`.
 *
 * @param provider - Provider metadata
 * @returns true if maxPages needs to be transformed to pageRange
 */
declare function requiresMaxPagesTransformation(provider: NormalizedProviderMetadata): boolean;
|
|
932
1036
|
|
|
933
1037
|
/**
|
|
934
1038
|
* @doclo/core - Retry Utilities
|
|
@@ -1090,4 +1194,4 @@ declare function getCircuitBreaker(key: string): CircuitBreaker | undefined;
|
|
|
1090
1194
|
*/
|
|
1091
1195
|
declare function withRetry<T>(fn: () => Promise<T>, options?: WithRetryOptions<T>): Promise<T>;
|
|
1092
1196
|
|
|
1093
|
-
export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, resolveDocument, resolveModelMetadata, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };
|
|
1197
|
+
export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DerivedFeatureOptions, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, type FeatureStatus, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type PageIndexing, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type TransformedOptions, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getPageIndexing, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isFeatureEnabled, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, requiresMaxPagesTransformation, resolveDocument, resolveModelMetadata, transformDerivedFeatures, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };
|
package/dist/index.js
CHANGED
|
@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
|
|
|
1398
1398
|
}
|
|
1399
1399
|
|
|
1400
1400
|
// src/provider-query.ts
|
|
1401
|
+
/**
 * Report whether a normalized feature status counts as usable.
 *
 * `true`, `"deprecated"` and `"derived"` all count as enabled; every other
 * value (including `false` and `undefined`) counts as disabled.
 *
 * @param status - Feature status to test
 * @returns `true` when the feature is enabled, otherwise `false`
 */
function isFeatureEnabled(status) {
  switch (status) {
    case true:
    case "deprecated":
    case "derived":
      return true;
    default:
      return false;
  }
}
|
|
1401
1404
|
var providerRegistry = /* @__PURE__ */ new Map();
|
|
1402
1405
|
function registerProviderMetadata(source, metadata, normalizer) {
|
|
1403
1406
|
const normalized = /* @__PURE__ */ new Map();
|
|
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
|
|
|
1482
1485
|
}
|
|
1483
1486
|
if (filter.hasFeatures && filter.hasFeatures.length > 0) {
|
|
1484
1487
|
providers = providers.filter(
|
|
1485
|
-
(p) => filter.hasFeatures.every((feature) => p.features[feature]
|
|
1488
|
+
(p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
|
|
1486
1489
|
);
|
|
1487
1490
|
}
|
|
1488
1491
|
if (filter.outputFormat) {
|
|
@@ -1581,9 +1584,25 @@ function defaultNormalizer(id, data, source) {
|
|
|
1581
1584
|
tableMerging: false,
|
|
1582
1585
|
confidence: false,
|
|
1583
1586
|
boundingBoxes: false,
|
|
1587
|
+
imageBoundingBoxes: false,
|
|
1584
1588
|
schemaValidation: false,
|
|
1585
1589
|
handwrittenText: false,
|
|
1586
1590
|
headerFooterExtraction: false,
|
|
1591
|
+
// Extended features
|
|
1592
|
+
embedOptimized: false,
|
|
1593
|
+
passwordProtected: false,
|
|
1594
|
+
contentFiltering: false,
|
|
1595
|
+
ocrMode: false,
|
|
1596
|
+
webhookCallback: false,
|
|
1597
|
+
mediaResolution: false,
|
|
1598
|
+
changeTracking: false,
|
|
1599
|
+
hyperlinkExtraction: false,
|
|
1600
|
+
chartUnderstanding: false,
|
|
1601
|
+
imageCaptions: false,
|
|
1602
|
+
signatureExtraction: false,
|
|
1603
|
+
commentExtraction: false,
|
|
1604
|
+
highlightExtraction: false,
|
|
1605
|
+
figureSummaries: false,
|
|
1587
1606
|
outputFormats: defaultOutputFormats
|
|
1588
1607
|
};
|
|
1589
1608
|
return {
|
|
@@ -1638,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
|
|
|
1638
1657
|
html: true,
|
|
1639
1658
|
json: d.capabilities?.supportsStructuredOutput ?? true
|
|
1640
1659
|
};
|
|
1660
|
+
const vendor = d.vendor ?? id;
|
|
1641
1661
|
const features = {
|
|
1642
|
-
maxPages:
|
|
1643
|
-
|
|
1644
|
-
|
|
1662
|
+
maxPages: "derived",
|
|
1663
|
+
// SDK can limit via pre-processing
|
|
1664
|
+
pageRange: false,
|
|
1665
|
+
// No native API support - LLMs receive full text
|
|
1645
1666
|
languageHints: false,
|
|
1646
1667
|
// Not applicable to LLMs
|
|
1647
1668
|
processingModes: false,
|
|
@@ -1654,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1654
1675
|
// LLMs don't extract images
|
|
1655
1676
|
pageMarkers: false,
|
|
1656
1677
|
// LLMs don't add page markers
|
|
1657
|
-
citations: false,
|
|
1658
|
-
//
|
|
1678
|
+
citations: vendor === "anthropic" ? true : false,
|
|
1679
|
+
// Anthropic has Citations API
|
|
1659
1680
|
chunking: false,
|
|
1660
1681
|
// LLMs don't do chunking
|
|
1661
1682
|
segmentation: false,
|
|
@@ -1669,15 +1690,32 @@ function normalizeLLMProvider(id, d) {
|
|
|
1669
1690
|
// LLMs don't provide confidence scores
|
|
1670
1691
|
boundingBoxes: false,
|
|
1671
1692
|
// LLMs don't provide bounding boxes
|
|
1693
|
+
imageBoundingBoxes: false,
|
|
1694
|
+
// LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
|
|
1672
1695
|
schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
|
|
1673
1696
|
// Some LLMs support schema validation
|
|
1674
1697
|
handwrittenText: false,
|
|
1675
1698
|
// Not specific to LLMs
|
|
1676
1699
|
headerFooterExtraction: false,
|
|
1677
1700
|
// LLMs don't extract header/footer separately
|
|
1701
|
+
// Extended features
|
|
1702
|
+
embedOptimized: false,
|
|
1703
|
+
passwordProtected: false,
|
|
1704
|
+
contentFiltering: false,
|
|
1705
|
+
ocrMode: false,
|
|
1706
|
+
webhookCallback: false,
|
|
1707
|
+
mediaResolution: vendor === "google" ? true : false,
|
|
1708
|
+
// Google Gemini has mediaResolution
|
|
1709
|
+
changeTracking: false,
|
|
1710
|
+
hyperlinkExtraction: false,
|
|
1711
|
+
chartUnderstanding: false,
|
|
1712
|
+
imageCaptions: false,
|
|
1713
|
+
signatureExtraction: false,
|
|
1714
|
+
commentExtraction: false,
|
|
1715
|
+
highlightExtraction: false,
|
|
1716
|
+
figureSummaries: false,
|
|
1678
1717
|
outputFormats
|
|
1679
1718
|
};
|
|
1680
|
-
const vendor = d.vendor ?? id;
|
|
1681
1719
|
return {
|
|
1682
1720
|
id,
|
|
1683
1721
|
name: d.name ?? id,
|
|
@@ -1698,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1698
1736
|
supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
|
|
1699
1737
|
// NEW capabilities
|
|
1700
1738
|
supportsPrompts: true,
|
|
1701
|
-
supportsCitations:
|
|
1739
|
+
supportsCitations: vendor === "anthropic",
|
|
1740
|
+
// Anthropic has Citations API
|
|
1702
1741
|
supportsChunking: false,
|
|
1703
1742
|
supportsImageExtraction: false,
|
|
1704
1743
|
supportsPageMarkers: false,
|
|
@@ -1745,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1745
1784
|
function normalizeDatalabProvider(id, d) {
|
|
1746
1785
|
const opts = d.supportedOptions ?? {};
|
|
1747
1786
|
const isVLM = d.type === "VLM";
|
|
1787
|
+
const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
|
|
1788
|
+
const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
|
|
1748
1789
|
const model = d.model ?? id;
|
|
1749
1790
|
const outputFormats = {
|
|
1750
1791
|
text: true,
|
|
@@ -1755,35 +1796,61 @@ function normalizeDatalabProvider(id, d) {
|
|
|
1755
1796
|
const features = {
|
|
1756
1797
|
maxPages: opts.maxPages ?? false,
|
|
1757
1798
|
pageRange: opts.pageRange ?? false,
|
|
1758
|
-
languageHints: opts.langs
|
|
1759
|
-
//
|
|
1799
|
+
languageHints: opts.langs ? "deprecated" : false,
|
|
1800
|
+
// API ignores, handled automatically
|
|
1760
1801
|
processingModes: opts.mode ?? false,
|
|
1761
1802
|
agenticMode: false,
|
|
1762
1803
|
// Datalab doesn't have agentic mode
|
|
1763
|
-
customPrompts: opts.blockCorrectionPrompt
|
|
1804
|
+
customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
|
|
1805
|
+
// Not currently supported
|
|
1764
1806
|
imageExtraction: opts.extractImages ?? false,
|
|
1765
1807
|
pageMarkers: opts.paginate ?? false,
|
|
1766
1808
|
// maps from 'paginate'
|
|
1767
|
-
citations:
|
|
1809
|
+
citations: isMarkerVLM ? true : false,
|
|
1810
|
+
// Marker VLM has citations
|
|
1768
1811
|
chunking: false,
|
|
1769
1812
|
// Datalab doesn't have chunking
|
|
1770
1813
|
segmentation: opts.segmentation ?? false,
|
|
1771
|
-
stripExistingOCR: opts.stripExistingOCR
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
//
|
|
1814
|
+
stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
|
|
1815
|
+
// Managed automatically
|
|
1816
|
+
formatLines: opts.formatLines ? "deprecated" : false,
|
|
1817
|
+
// Handled automatically
|
|
1818
|
+
forceOCR: "deprecated",
|
|
1819
|
+
// DEPRECATED: force_ocr param has no effect per API docs
|
|
1775
1820
|
tableOutputFormats: false,
|
|
1776
1821
|
tableMerging: false,
|
|
1777
1822
|
confidence: false,
|
|
1778
1823
|
// Datalab doesn't provide confidence scores
|
|
1779
1824
|
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
|
|
1780
|
-
// Datalab provides
|
|
1825
|
+
// Datalab Surya provides text bboxes
|
|
1826
|
+
imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
|
|
1827
|
+
// Marker extracts images with bboxes
|
|
1781
1828
|
schemaValidation: isVLM,
|
|
1782
1829
|
// VLM providers support schema validation
|
|
1783
1830
|
handwrittenText: true,
|
|
1784
1831
|
// Datalab handles handwritten text
|
|
1785
1832
|
headerFooterExtraction: false,
|
|
1786
1833
|
// Datalab has issues with header/footer extraction
|
|
1834
|
+
// Extended features
|
|
1835
|
+
embedOptimized: false,
|
|
1836
|
+
passwordProtected: false,
|
|
1837
|
+
contentFiltering: false,
|
|
1838
|
+
ocrMode: false,
|
|
1839
|
+
webhookCallback: true,
|
|
1840
|
+
// Datalab supports webhook callbacks
|
|
1841
|
+
mediaResolution: false,
|
|
1842
|
+
changeTracking: true,
|
|
1843
|
+
// Datalab marker_extras supports track_changes
|
|
1844
|
+
hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
|
|
1845
|
+
// Datalab extras=extract_links
|
|
1846
|
+
chartUnderstanding: isMarkerOCR || isMarkerVLM,
|
|
1847
|
+
// Datalab extras=chart_understanding
|
|
1848
|
+
imageCaptions: isMarkerOCR || isMarkerVLM,
|
|
1849
|
+
// Datalab disable_image_captions param
|
|
1850
|
+
signatureExtraction: false,
|
|
1851
|
+
commentExtraction: false,
|
|
1852
|
+
highlightExtraction: false,
|
|
1853
|
+
figureSummaries: false,
|
|
1787
1854
|
outputFormats
|
|
1788
1855
|
};
|
|
1789
1856
|
return {
|
|
@@ -1852,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
|
|
|
1852
1919
|
const opts = d.supportedOptions ?? {};
|
|
1853
1920
|
const isVLM = d.type === "VLM";
|
|
1854
1921
|
const isExtract = d.compatibleNodes?.extract === true;
|
|
1922
|
+
const isParse = d.compatibleNodes?.parse === true;
|
|
1855
1923
|
const model = d.model ?? "v1";
|
|
1856
1924
|
const outputFormats = {
|
|
1857
1925
|
text: d.outputFormat?.features?.textLines ?? true,
|
|
@@ -1861,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
|
|
|
1861
1929
|
json: d.outputFormat?.features?.structuredJSON ?? isExtract
|
|
1862
1930
|
};
|
|
1863
1931
|
const features = {
|
|
1864
|
-
maxPages: opts.
|
|
1932
|
+
maxPages: opts.pageRange ?? false ? "derived" : false,
|
|
1933
|
+
// SDK derives from pageRange (1-indexed)
|
|
1865
1934
|
pageRange: opts.pageRange ?? false,
|
|
1866
|
-
languageHints:
|
|
1867
|
-
// Reducto doesn't support
|
|
1935
|
+
languageHints: false,
|
|
1936
|
+
// Reducto doesn't support language hints
|
|
1868
1937
|
processingModes: false,
|
|
1869
1938
|
// Reducto uses agentic instead
|
|
1870
1939
|
agenticMode: opts.mode ?? false,
|
|
@@ -1887,14 +1956,44 @@ function normalizeReductoProvider(id, d) {
|
|
|
1887
1956
|
// Parse has mergeTables
|
|
1888
1957
|
confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
|
|
1889
1958
|
// Reducto Parse has confidence
|
|
1890
|
-
boundingBoxes: d.outputFormat?.features?.boundingBoxes ??
|
|
1891
|
-
// Reducto Parse has bounding boxes
|
|
1959
|
+
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
|
|
1960
|
+
// Reducto Parse has text bounding boxes
|
|
1961
|
+
imageBoundingBoxes: isParse ? true : false,
|
|
1962
|
+
// Reducto Parse has figure bounding boxes
|
|
1892
1963
|
schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
|
|
1893
1964
|
// Extract has schema validation
|
|
1894
1965
|
handwrittenText: false,
|
|
1895
1966
|
// Reducto doesn't specifically advertise handwriting
|
|
1896
1967
|
headerFooterExtraction: true,
|
|
1897
1968
|
// Reducto has Header/Footer block types
|
|
1969
|
+
// Extended features
|
|
1970
|
+
embedOptimized: isParse,
|
|
1971
|
+
// Reducto Parse supports retrieval.embedding_optimized: true
|
|
1972
|
+
passwordProtected: true,
|
|
1973
|
+
// Reducto handles encrypted PDFs
|
|
1974
|
+
contentFiltering: true,
|
|
1975
|
+
// Reducto can filter block types
|
|
1976
|
+
ocrMode: opts.ocrSystem ?? false,
|
|
1977
|
+
// Reducto has ocr_system selection
|
|
1978
|
+
webhookCallback: true,
|
|
1979
|
+
// Reducto supports webhook callbacks
|
|
1980
|
+
mediaResolution: false,
|
|
1981
|
+
changeTracking: true,
|
|
1982
|
+
// Reducto tracks changes in Word docs
|
|
1983
|
+
hyperlinkExtraction: true,
|
|
1984
|
+
// Reducto extracts hyperlinks via formatting.include
|
|
1985
|
+
chartUnderstanding: isParse,
|
|
1986
|
+
// Reducto enhance.agentic[].advanced_chart_agent for figures
|
|
1987
|
+
imageCaptions: false,
|
|
1988
|
+
// Not available in Reducto
|
|
1989
|
+
signatureExtraction: false,
|
|
1990
|
+
// NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
|
|
1991
|
+
commentExtraction: isParse || isExtract,
|
|
1992
|
+
// Reducto formatting.include: ["comments"]
|
|
1993
|
+
highlightExtraction: isParse || isExtract,
|
|
1994
|
+
// Reducto formatting.include: ["highlight"]
|
|
1995
|
+
figureSummaries: isParse,
|
|
1996
|
+
// Reducto enhance.summarize_figures
|
|
1898
1997
|
outputFormats
|
|
1899
1998
|
};
|
|
1900
1999
|
return {
|
|
@@ -1980,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
1980
2079
|
// Unsiloed doesn't have page range option
|
|
1981
2080
|
languageHints: false,
|
|
1982
2081
|
// Unsiloed doesn't support language hints
|
|
1983
|
-
processingModes:
|
|
2082
|
+
processingModes: false,
|
|
2083
|
+
// Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
|
|
1984
2084
|
agenticMode: false,
|
|
1985
2085
|
// Unsiloed doesn't have agentic mode
|
|
1986
2086
|
customPrompts: false,
|
|
@@ -2002,14 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
2002
2102
|
tableMerging: false,
|
|
2003
2103
|
confidence: d.outputFormat?.features?.confidence ?? false,
|
|
2004
2104
|
// Unsiloed may provide confidence
|
|
2005
|
-
boundingBoxes: d.outputFormat?.features?.boundingBoxes ??
|
|
2006
|
-
// Unsiloed
|
|
2105
|
+
boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
|
|
2106
|
+
// Unsiloed Parse has bounding boxes
|
|
2107
|
+
imageBoundingBoxes: false,
|
|
2108
|
+
// Unsiloed doesn't return image-specific bboxes
|
|
2007
2109
|
schemaValidation: isExtract,
|
|
2008
2110
|
// Extract supports schema validation
|
|
2009
|
-
handwrittenText: false,
|
|
2010
|
-
//
|
|
2111
|
+
handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
|
|
2112
|
+
// Parse supports handwriting
|
|
2011
2113
|
headerFooterExtraction: false,
|
|
2012
2114
|
// Unsiloed doesn't extract header/footer separately
|
|
2115
|
+
// Extended features
|
|
2116
|
+
embedOptimized: false,
|
|
2117
|
+
passwordProtected: false,
|
|
2118
|
+
contentFiltering: isParse,
|
|
2119
|
+
// Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
|
|
2120
|
+
ocrMode: isParse,
|
|
2121
|
+
// Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
|
|
2122
|
+
webhookCallback: false,
|
|
2123
|
+
// Unsiloed is synchronous
|
|
2124
|
+
mediaResolution: false,
|
|
2125
|
+
changeTracking: false,
|
|
2126
|
+
hyperlinkExtraction: false,
|
|
2127
|
+
chartUnderstanding: false,
|
|
2128
|
+
// Not available in Unsiloed
|
|
2129
|
+
imageCaptions: false,
|
|
2130
|
+
// Not available in Unsiloed
|
|
2131
|
+
signatureExtraction: false,
|
|
2132
|
+
// Not available in Unsiloed
|
|
2133
|
+
commentExtraction: false,
|
|
2134
|
+
// Not available in Unsiloed
|
|
2135
|
+
highlightExtraction: false,
|
|
2136
|
+
// Not available in Unsiloed
|
|
2137
|
+
figureSummaries: false,
|
|
2138
|
+
// Not available in Unsiloed
|
|
2013
2139
|
outputFormats
|
|
2014
2140
|
};
|
|
2015
2141
|
return {
|
|
@@ -2038,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
2038
2164
|
supportsImageExtraction: false,
|
|
2039
2165
|
supportsPageMarkers: false,
|
|
2040
2166
|
supportsLanguageHints: false,
|
|
2041
|
-
supportsProcessingModes:
|
|
2167
|
+
supportsProcessingModes: false,
|
|
2168
|
+
// Unsiloed doesn't have fast/balanced/high_accuracy modes
|
|
2042
2169
|
supportsSegmentation: isSplit || isCategorize,
|
|
2043
2170
|
outputFormats
|
|
2044
2171
|
},
|
|
@@ -2256,7 +2383,7 @@ function matchesModelFilter(model, filter) {
|
|
|
2256
2383
|
}
|
|
2257
2384
|
if (filter.hasFeatures && filter.hasFeatures.length > 0) {
|
|
2258
2385
|
for (const feature of filter.hasFeatures) {
|
|
2259
|
-
if (model.features[feature]
|
|
2386
|
+
if (!isFeatureEnabled(model.features[feature])) {
|
|
2260
2387
|
return false;
|
|
2261
2388
|
}
|
|
2262
2389
|
}
|
|
@@ -2299,6 +2426,44 @@ function getAllModels() {
|
|
|
2299
2426
|
function clearModelRegistry() {
|
|
2300
2427
|
modelRegistry.clear();
|
|
2301
2428
|
}
|
|
2429
|
+
/**
 * Page numbering convention per provider source.
 * Sources not listed here are treated as 1-indexed by getPageIndexing.
 */
var PAGE_INDEXING = {
  datalab: "0-indexed",
  reducto: "1-indexed",
  mistral: "0-indexed",
  unsiloed: "1-indexed", // Default assumption
  llm: "1-indexed" // N/A but default
};
/**
 * Get the page indexing convention for a provider.
 *
 * @param provider - Provider metadata (its `source` is used) or a source string
 * @returns "0-indexed" or "1-indexed"; unknown sources yield "1-indexed"
 */
function getPageIndexing(provider) {
  const source = typeof provider === "string" ? provider : provider.source;
  // Only consult the table's own keys. A bare `PAGE_INDEXING[source]` would
  // resolve names such as "constructor" or "toString" through Object.prototype
  // and hand back a function instead of falling through to the default.
  if (Object.prototype.hasOwnProperty.call(PAGE_INDEXING, source)) {
    return PAGE_INDEXING[source];
  }
  return "1-indexed";
}
|
|
2442
|
+
/**
 * Transform `maxPages` / `pageRange` into provider-specific page selection.
 *
 * Resolution order:
 * 1. An explicit `pageRange` is forwarded unchanged as `page_range`; any
 *    `maxPages` supplied alongside it is dropped.
 * 2. `maxPages` with a provider whose `features.maxPages` is "derived" becomes
 *    a range over the first `maxPages` pages ("0-(N-1)" for 0-indexed
 *    providers, "1-N" otherwise). For the "mistral" source an explicit
 *    `pages` index array is added as well.
 * 3. `maxPages` with a provider that otherwise has the feature enabled is
 *    passed through inside `remainingOptions`.
 * 4. `maxPages` with a provider that does not support it is omitted.
 *
 * @param options - User options; `maxPages` and `pageRange` are consumed,
 *   everything else is returned in `remainingOptions`
 * @param provider - Provider metadata (`features.maxPages` and `source` are read)
 * @returns Object with `remainingOptions` and, when applicable, `page_range`
 *   and `pages`
 * @throws {RangeError} When a range must be derived and `maxPages` is not a
 *   positive integer (0, negatives, fractions and NaN would otherwise produce
 *   malformed ranges such as "0--1")
 */
function transformDerivedFeatures(options, provider) {
  const { maxPages, pageRange, ...remainingOptions } = options;
  // An explicit page range always wins over maxPages.
  if (pageRange !== void 0) {
    return { remainingOptions, page_range: pageRange };
  }
  if (maxPages === void 0) {
    return { remainingOptions };
  }
  const maxPagesStatus = provider.features.maxPages;
  if (maxPagesStatus === "derived") {
    // The value is interpolated into a range string, so reject anything that
    // cannot describe "the first N pages".
    if (!Number.isInteger(maxPages) || maxPages < 1) {
      throw new RangeError(
        `maxPages must be a positive integer, received ${String(maxPages)}`
      );
    }
    if (getPageIndexing(provider) !== "0-indexed") {
      return { remainingOptions, page_range: `1-${maxPages}` };
    }
    const zeroBased = { remainingOptions, page_range: `0-${maxPages - 1}` };
    if (provider.source === "mistral") {
      // Also emit the array form of the same selection for Mistral.
      return { ...zeroBased, pages: Array.from({ length: maxPages }, (_, i) => i) };
    }
    return zeroBased;
  }
  if (isFeatureEnabled(maxPagesStatus)) {
    // Enabled without derivation: hand maxPages back untouched.
    remainingOptions.maxPages = maxPages;
  }
  return { remainingOptions };
}
|
|
2464
|
+
/**
 * Tell whether `maxPages` has to be rewritten into a page range for this
 * provider, i.e. whether its `maxPages` feature status is "derived".
 *
 * @param provider - Provider metadata; `features.maxPages` is read
 * @returns `true` only when the status is exactly "derived"
 */
function requiresMaxPagesTransformation(provider) {
  const { maxPages: maxPagesStatus } = provider.features;
  return maxPagesStatus === "derived";
}
|
|
2302
2467
|
function normalizeMistralProvider(id, d) {
|
|
2303
2468
|
const opts = d.supportedOptions ?? {};
|
|
2304
2469
|
const isVLM = d.type === "VLM";
|
|
@@ -2314,7 +2479,7 @@ function normalizeMistralProvider(id, d) {
|
|
|
2314
2479
|
const features = {
|
|
2315
2480
|
maxPages: d.inputFormats?.maxPages !== void 0,
|
|
2316
2481
|
pageRange: true,
|
|
2317
|
-
// Mistral supports pages param: "0-5" or [0,2,5]
|
|
2482
|
+
// Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
|
|
2318
2483
|
languageHints: false,
|
|
2319
2484
|
// Mistral doesn't support language hints
|
|
2320
2485
|
processingModes: false,
|
|
@@ -2342,14 +2507,39 @@ function normalizeMistralProvider(id, d) {
|
|
|
2342
2507
|
tableMerging: false,
|
|
2343
2508
|
confidence: false,
|
|
2344
2509
|
// Mistral doesn't provide confidence scores
|
|
2345
|
-
boundingBoxes:
|
|
2346
|
-
//
|
|
2510
|
+
boundingBoxes: false,
|
|
2511
|
+
// Mistral does NOT provide text-level bounding boxes
|
|
2512
|
+
imageBoundingBoxes: true,
|
|
2513
|
+
// Mistral provides image/figure bounding boxes only
|
|
2347
2514
|
schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
|
|
2348
2515
|
// VLM supports schema
|
|
2349
2516
|
handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
|
|
2350
2517
|
// Excellent handwriting support
|
|
2351
2518
|
headerFooterExtraction: opts.extractHeader ?? opts.extractFooter ?? false,
|
|
2352
2519
|
// extract_header/extract_footer
|
|
2520
|
+
// Extended features
|
|
2521
|
+
embedOptimized: false,
|
|
2522
|
+
passwordProtected: false,
|
|
2523
|
+
contentFiltering: false,
|
|
2524
|
+
ocrMode: false,
|
|
2525
|
+
webhookCallback: false,
|
|
2526
|
+
// Mistral is synchronous
|
|
2527
|
+
mediaResolution: false,
|
|
2528
|
+
changeTracking: false,
|
|
2529
|
+
hyperlinkExtraction: true,
|
|
2530
|
+
// Response pages[].hyperlinks[] auto-extracted
|
|
2531
|
+
chartUnderstanding: false,
|
|
2532
|
+
// Not available as separate feature in Mistral
|
|
2533
|
+
imageCaptions: false,
|
|
2534
|
+
// Not available in Mistral
|
|
2535
|
+
signatureExtraction: false,
|
|
2536
|
+
// Not available in Mistral
|
|
2537
|
+
commentExtraction: false,
|
|
2538
|
+
// Not available in Mistral
|
|
2539
|
+
highlightExtraction: false,
|
|
2540
|
+
// Not available in Mistral
|
|
2541
|
+
figureSummaries: false,
|
|
2542
|
+
// Not available in Mistral
|
|
2353
2543
|
outputFormats
|
|
2354
2544
|
};
|
|
2355
2545
|
return {
|
|
@@ -2366,8 +2556,8 @@ function normalizeMistralProvider(id, d) {
|
|
|
2366
2556
|
capabilities: {
|
|
2367
2557
|
supportsImages: d.capabilities?.supportsImages ?? true,
|
|
2368
2558
|
supportsPDFs: d.capabilities?.supportsPDFs ?? true,
|
|
2369
|
-
supportsDocuments: d.capabilities?.supportsDocuments ??
|
|
2370
|
-
// DOCX
|
|
2559
|
+
supportsDocuments: d.capabilities?.supportsDocuments ?? true,
|
|
2560
|
+
// Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
|
|
2371
2561
|
supportsReasoning: false,
|
|
2372
2562
|
// OCR 3 doesn't do reasoning
|
|
2373
2563
|
supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
|
|
@@ -2635,6 +2825,7 @@ export {
|
|
|
2635
2825
|
getNodeTypeName,
|
|
2636
2826
|
getPDFPageCount,
|
|
2637
2827
|
getPageCountMetadata,
|
|
2828
|
+
getPageIndexing,
|
|
2638
2829
|
getProviderById,
|
|
2639
2830
|
getProvidersBySource,
|
|
2640
2831
|
getProvidersForLargeFiles,
|
|
@@ -2642,6 +2833,7 @@ export {
|
|
|
2642
2833
|
getSuggestedConnections,
|
|
2643
2834
|
getTotalPageCount,
|
|
2644
2835
|
getValidForEachStarters,
|
|
2836
|
+
isFeatureEnabled,
|
|
2645
2837
|
isLocalEndpoint,
|
|
2646
2838
|
isPDFDocument,
|
|
2647
2839
|
isRetryableError,
|
|
@@ -2653,11 +2845,13 @@ export {
|
|
|
2653
2845
|
queryProviders,
|
|
2654
2846
|
registerProviderMetadata,
|
|
2655
2847
|
registerProviderWithModels,
|
|
2848
|
+
requiresMaxPagesTransformation,
|
|
2656
2849
|
resolveDocument,
|
|
2657
2850
|
resolveModelMetadata,
|
|
2658
2851
|
runPipeline,
|
|
2659
2852
|
splitPDFIntoChunks,
|
|
2660
2853
|
toProviderString,
|
|
2854
|
+
transformDerivedFeatures,
|
|
2661
2855
|
validateFlowInputFormat,
|
|
2662
2856
|
validateJson,
|
|
2663
2857
|
validateMimeType,
|