npm - @doclo/core - Versions diffs - 0.1.12 → 0.2.0 - Mend

@doclo/core 0.1.12 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -511,6 +511,22 @@ type OutputFormatSupport = {
     html: boolean;
     json: boolean;
 };
+/**
+ * Feature status values for normalized features.
+ * - `true`: Natively supported by the API
+ * - `false`: Not supported
+ * - `'deprecated'`: API deprecated this feature, may not work
+ * - `'derived'`: SDK provides via transformation (e.g., maxPages from pageRange)
+ *
+ * Every value except `false` counts as "enabled" for `isFeatureEnabled`, so a
+ * `'deprecated'` feature still passes enabled-feature checks even though it
+ * may not work.
+ */
+type FeatureStatus = true | false | 'deprecated' | 'derived';
+/**
+ * Helper to check if a feature is enabled (true, deprecated, or derived).
+ *
+ * @param status - Feature status value to test
+ * @returns `true` when `status` is `true`, `'deprecated'`, or `'derived'`; `false` otherwise
+ */
+declare function isFeatureEnabled(status: FeatureStatus): boolean;
+/**
+ * Page indexing convention used by provider.
+ * - `'0-indexed'`: the provider numbers the first page 0
+ * - `'1-indexed'`: the provider numbers the first page 1
+ */
+type PageIndexing = '0-indexed' | '1-indexed';
 /**
  * Normalized features across all providers.
  * Maps provider-specific option names to unified names.
@@ -520,47 +536,77 @@ type OutputFormatSupport = {
  */
 type NormalizedFeatures = {
     /** Limit to first N pages */
-    maxPages: boolean;
+    maxPages: FeatureStatus;
     /** Specific page range selection */
-    pageRange: boolean;
+    pageRange: FeatureStatus;
     /** OCR language hints (maps from 'langs') */
-    languageHints: boolean;
+    languageHints: FeatureStatus;
     /** Quality/speed modes (fast/balanced/high_accuracy) */
-    processingModes: boolean;
+    processingModes: FeatureStatus;
     /** Reducto agentic mode (higher accuracy, more cost) */
-    agenticMode: boolean;
+    agenticMode: FeatureStatus;
     /** Custom prompts (maps from blockCorrectionPrompt, additionalPrompt, systemPrompt) */
-    customPrompts: boolean;
+    customPrompts: FeatureStatus;
     /** Extract embedded images (maps from extractImages, returnImages) */
-    imageExtraction: boolean;
+    imageExtraction: FeatureStatus;
     /** Page delimiters (maps from paginate, addPageMarkers) */
-    pageMarkers: boolean;
-    /** Field-level citations with source references */
-    citations: boolean;
+    pageMarkers: FeatureStatus;
+    /** Field-level citations with source references (page/char/block indices) */
+    citations: FeatureStatus;
     /** Document chunking modes (RAG-optimized) */
-    chunking: boolean;
+    chunking: FeatureStatus;
     /** Auto-segmentation for multi-document PDFs */
-    segmentation: boolean;
+    segmentation: FeatureStatus;
     /** Re-run OCR on already-OCR'd documents */
-    stripExistingOCR: boolean;
+    stripExistingOCR: FeatureStatus;
     /** Format lines in output */
-    formatLines: boolean;
+    formatLines: FeatureStatus;
     /** Force OCR even if text layer exists */
-    forceOCR: boolean;
+    forceOCR: FeatureStatus;
     /** Table format options (html/json/md/csv) */
-    tableOutputFormats: boolean;
+    tableOutputFormats: FeatureStatus;
     /** Merge consecutive tables */
-    tableMerging: boolean;
+    tableMerging: FeatureStatus;
     /** Block-level confidence scores */
-    confidence: boolean;
-    /** Bounding box coordinates for text/elements */
-    boundingBoxes: boolean;
+    confidence: FeatureStatus;
+    /** Bounding box coordinates for TEXT elements (pixel/normalized coords) */
+    boundingBoxes: FeatureStatus;
+    /** Bounding box coordinates for IMAGES/FIGURES only (not text) */
+    imageBoundingBoxes: FeatureStatus;
     /** JSON schema validation for structured output */
-    schemaValidation: boolean;
+    schemaValidation: FeatureStatus;
     /** Handwritten text recognition support */
-    handwrittenText: boolean;
+    handwrittenText: FeatureStatus;
     /** Separate header/footer extraction from main content */
-    headerFooterExtraction: boolean;
+    headerFooterExtraction: FeatureStatus;
+    /** Optimize output for embeddings/RAG */
+    embedOptimized: FeatureStatus;
+    /** Handle encrypted/password-protected PDFs */
+    passwordProtected: FeatureStatus;
+    /** Filter block types (headers, footers, page numbers, etc.) */
+    contentFiltering: FeatureStatus;
+    /** OCR system/mode selection (standard/legacy, auto/full) */
+    ocrMode: FeatureStatus;
+    /** Async completion webhook callbacks */
+    webhookCallback: FeatureStatus;
+    /** Vision quality control (low/medium/high) - Gemini */
+    mediaResolution: FeatureStatus;
+    /** Track changes extraction from Word docs */
+    changeTracking: FeatureStatus;
+    /** Extract hyperlinks from documents */
+    hyperlinkExtraction: FeatureStatus;
+    /** Enhanced chart and graph interpretation (Datalab extras=chart_understanding) */
+    chartUnderstanding: FeatureStatus;
+    /** Control image caption generation (Datalab disable_image_captions) */
+    imageCaptions: FeatureStatus;
+    /** Extract signatures from documents (Reducto include: ["signatures"]) */
+    signatureExtraction: FeatureStatus;
+    /** Extract comments/annotations from documents (Reducto include: ["comments"]) */
+    commentExtraction: FeatureStatus;
+    /** Extract highlighted text from documents (Reducto include: ["highlight"]) */
+    highlightExtraction: FeatureStatus;
+    /** Summarize figures/charts with VLM (Reducto summarize_figures) */
+    figureSummaries: FeatureStatus;
     /** Supported output formats */
     outputFormats: OutputFormatSupport;
 };
@@ -929,6 +975,64 @@ declare function getAllModels(): ResolvedModelMetadata[];
  * Clear model registry (useful for testing)
  */
 declare function clearModelRegistry(): void;
+/**
+ * Get the page indexing convention for a provider.
+ *
+ * When provider metadata is passed, its `source` field is used for the lookup.
+ * Sources without a known convention fall back to `'1-indexed'`.
+ *
+ * @param provider - Provider metadata or source string
+ * @returns Page indexing convention ('0-indexed' or '1-indexed')
+ */
+declare function getPageIndexing(provider: NormalizedProviderMetadata | string): PageIndexing;
+/**
+ * Options that can be transformed for derived features.
+ *
+ * - `maxPages`: limit processing to the first N pages.
+ * - `pageRange`: explicit page range. When set, it is forwarded unchanged as
+ *   `page_range` and takes precedence over `maxPages`.
+ */
+type DerivedFeatureOptions = {
+    maxPages?: number;
+    pageRange?: string;
+};
+/**
+ * Result of derived feature transformation.
+ */
+type TransformedOptions = {
+    /** Provider-facing page_range: the caller's `pageRange` verbatim, or a range derived from `maxPages` in the provider's page indexing */
+    page_range?: string;
+    /** Array form of the derived pages; only populated when the provider source is `mistral` */
+    pages?: number[];
+    /** Original options minus `maxPages`/`pageRange`; `maxPages` is added back when the provider's maxPages status is enabled but not `'derived'` */
+    remainingOptions: Record<string, unknown>;
+};
+/**
+ * Transform maxPages to provider-specific pageRange format.
+ *
+ * This utility handles the conversion when a provider has `maxPages: 'derived'`,
+ * meaning the SDK provides maxPages functionality via the underlying pageRange API.
+ *
+ * Resolution order:
+ * 1. If `pageRange` is set, it is returned as `page_range` unchanged and
+ *    `maxPages` is ignored.
+ * 2. Otherwise, if `provider.features.maxPages` is `'derived'`, `maxPages` is
+ *    converted to a range starting at the provider's first page index.
+ * 3. Otherwise, if the provider's maxPages status is enabled, `maxPages` is
+ *    passed through inside `remainingOptions`.
+ * 4. Otherwise `maxPages` is dropped from the result.
+ *
+ * @param options - User-provided options including maxPages
+ * @param provider - Provider metadata to determine format
+ * @returns Transformed options with provider-specific pageRange
+ *
+ * @example
+ * The examples assume each provider's `features.maxPages` is `'derived'`.
+ * ```typescript
+ * // User wants first 5 pages from Reducto (1-indexed)
+ * const result = transformDerivedFeatures({ maxPages: 5 }, reductoProvider);
+ * // => { page_range: '1-5', remainingOptions: {} }
+ *
+ * // User wants first 5 pages from Datalab (0-indexed)
+ * const result = transformDerivedFeatures({ maxPages: 5 }, datalabProvider);
+ * // => { page_range: '0-4', remainingOptions: {} }
+ *
+ * // User wants first 5 pages from Mistral (0-indexed, array format)
+ * const result = transformDerivedFeatures({ maxPages: 5 }, mistralProvider);
+ * // => { page_range: '0-4', pages: [0,1,2,3,4], remainingOptions: {} }
+ * ```
+ */
+declare function transformDerivedFeatures(options: DerivedFeatureOptions & Record<string, unknown>, provider: NormalizedProviderMetadata): TransformedOptions;
+/**
+ * Check if a provider requires derived feature transformation for maxPages.
+ *
+ * @param provider - Provider metadata
+ * @returns true if `provider.features.maxPages` is `'derived'`, i.e. maxPages needs to be transformed to pageRange
+ */
+declare function requiresMaxPagesTransformation(provider: NormalizedProviderMetadata): boolean;
 /**
  * @doclo/core - Retry Utilities
@@ -1090,4 +1194,4 @@ declare function getCircuitBreaker(key: string): CircuitBreaker | undefined;
  */
 declare function withRetry<T>(fn: () => Promise<T>, options?: WithRetryOptions<T>): Promise<T>;
-export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, resolveDocument, resolveModelMetadata, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };
+export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DerivedFeatureOptions, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, type FeatureStatus, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type PageIndexing, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type TransformedOptions, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getPageIndexing, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isFeatureEnabled, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, requiresMaxPagesTransformation, resolveDocument, resolveModelMetadata, transformDerivedFeatures, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };

package/dist/index.js CHANGED Viewed

@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
 }
 // src/provider-query.ts
+function isFeatureEnabled(status) {
+  return status === true || status === "deprecated" || status === "derived";
+}
 var providerRegistry = /* @__PURE__ */ new Map();
 function registerProviderMetadata(source, metadata, normalizer) {
   const normalized = /* @__PURE__ */ new Map();
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
   }
   if (filter.hasFeatures && filter.hasFeatures.length > 0) {
     providers = providers.filter(
-      (p) => filter.hasFeatures.every((feature) => p.features[feature] === true)
+      (p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
     );
   }
   if (filter.outputFormat) {
@@ -1581,9 +1584,25 @@ function defaultNormalizer(id, data, source) {
     tableMerging: false,
     confidence: false,
     boundingBoxes: false,
+    imageBoundingBoxes: false,
     schemaValidation: false,
     handwrittenText: false,
     headerFooterExtraction: false,
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: false,
+    mediaResolution: false,
+    changeTracking: false,
+    hyperlinkExtraction: false,
+    chartUnderstanding: false,
+    imageCaptions: false,
+    signatureExtraction: false,
+    commentExtraction: false,
+    highlightExtraction: false,
+    figureSummaries: false,
     outputFormats: defaultOutputFormats
   };
   return {
@@ -1638,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
     html: true,
     json: d.capabilities?.supportsStructuredOutput ?? true
   };
+  const vendor = d.vendor ?? id;
   const features = {
-    maxPages: d.inputFormats?.pdfs?.maxPages !== void 0,
-    pageRange: true,
-    // LLMs can handle page ranges
+    maxPages: "derived",
+    // SDK can limit via pre-processing
+    pageRange: false,
+    // No native API support - LLMs receive full text
     languageHints: false,
     // Not applicable to LLMs
     processingModes: false,
@@ -1654,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
     // LLMs don't extract images
     pageMarkers: false,
     // LLMs don't add page markers
-    citations: false,
-    // Most LLMs don't have native citations (Anthropic has different API)
+    citations: vendor === "anthropic" ? true : false,
+    // Anthropic has Citations API
     chunking: false,
     // LLMs don't do chunking
     segmentation: false,
@@ -1669,15 +1690,32 @@ function normalizeLLMProvider(id, d) {
     // LLMs don't provide confidence scores
     boundingBoxes: false,
     // LLMs don't provide bounding boxes
+    imageBoundingBoxes: false,
+    // LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
     schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
     // Some LLMs support schema validation
     handwrittenText: false,
     // Not specific to LLMs
     headerFooterExtraction: false,
     // LLMs don't extract header/footer separately
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: false,
+    mediaResolution: vendor === "google" ? true : false,
+    // Google Gemini has mediaResolution
+    changeTracking: false,
+    hyperlinkExtraction: false,
+    chartUnderstanding: false,
+    imageCaptions: false,
+    signatureExtraction: false,
+    commentExtraction: false,
+    highlightExtraction: false,
+    figureSummaries: false,
     outputFormats
   };
-  const vendor = d.vendor ?? id;
   return {
     id,
     name: d.name ?? id,
@@ -1698,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
       supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
       // NEW capabilities
       supportsPrompts: true,
-      supportsCitations: false,
+      supportsCitations: vendor === "anthropic",
+      // Anthropic has Citations API
       supportsChunking: false,
       supportsImageExtraction: false,
       supportsPageMarkers: false,
@@ -1745,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
 function normalizeDatalabProvider(id, d) {
   const opts = d.supportedOptions ?? {};
   const isVLM = d.type === "VLM";
+  const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
+  const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
   const model = d.model ?? id;
   const outputFormats = {
     text: true,
@@ -1755,35 +1796,61 @@ function normalizeDatalabProvider(id, d) {
   const features = {
     maxPages: opts.maxPages ?? false,
     pageRange: opts.pageRange ?? false,
-    languageHints: opts.langs ?? false,
-    // maps from 'langs'
+    languageHints: opts.langs ? "deprecated" : false,
+    // API ignores, handled automatically
     processingModes: opts.mode ?? false,
     agenticMode: false,
     // Datalab doesn't have agentic mode
-    customPrompts: opts.blockCorrectionPrompt ?? false,
+    customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
+    // Not currently supported
     imageExtraction: opts.extractImages ?? false,
     pageMarkers: opts.paginate ?? false,
     // maps from 'paginate'
-    citations: opts.citations ?? false,
+    citations: isMarkerVLM ? true : false,
+    // Marker VLM has citations
     chunking: false,
     // Datalab doesn't have chunking
     segmentation: opts.segmentation ?? false,
-    stripExistingOCR: opts.stripExistingOCR ?? false,
-    formatLines: opts.formatLines ?? false,
-    forceOCR: true,
-    // Datalab supports force_ocr
+    stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
+    // Managed automatically
+    formatLines: opts.formatLines ? "deprecated" : false,
+    // Handled automatically
+    forceOCR: "deprecated",
+    // DEPRECATED: force_ocr param has no effect per API docs
     tableOutputFormats: false,
     tableMerging: false,
     confidence: false,
     // Datalab doesn't provide confidence scores
     boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
-    // Datalab provides bounding boxes
+    // Datalab Surya provides text bboxes
+    imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
+    // Marker extracts images with bboxes
     schemaValidation: isVLM,
     // VLM providers support schema validation
     handwrittenText: true,
     // Datalab handles handwritten text
     headerFooterExtraction: false,
     // Datalab has issues with header/footer extraction
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: true,
+    // Datalab supports webhook callbacks
+    mediaResolution: false,
+    changeTracking: true,
+    // Datalab marker_extras supports track_changes
+    hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
+    // Datalab extras=extract_links
+    chartUnderstanding: isMarkerOCR || isMarkerVLM,
+    // Datalab extras=chart_understanding
+    imageCaptions: isMarkerOCR || isMarkerVLM,
+    // Datalab disable_image_captions param
+    signatureExtraction: false,
+    commentExtraction: false,
+    highlightExtraction: false,
+    figureSummaries: false,
     outputFormats
   };
   return {
@@ -1852,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
   const opts = d.supportedOptions ?? {};
   const isVLM = d.type === "VLM";
   const isExtract = d.compatibleNodes?.extract === true;
+  const isParse = d.compatibleNodes?.parse === true;
   const model = d.model ?? "v1";
   const outputFormats = {
     text: d.outputFormat?.features?.textLines ?? true,
@@ -1861,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
     json: d.outputFormat?.features?.structuredJSON ?? isExtract
   };
   const features = {
-    maxPages: opts.maxPages ?? false,
+    maxPages: opts.pageRange ?? false ? "derived" : false,
+    // SDK derives from pageRange (1-indexed)
     pageRange: opts.pageRange ?? false,
-    languageHints: opts.langs ?? false,
-    // Reducto doesn't support langs
+    languageHints: false,
+    // Reducto doesn't support language hints
     processingModes: false,
     // Reducto uses agentic instead
     agenticMode: opts.mode ?? false,
@@ -1887,14 +1956,44 @@ function normalizeReductoProvider(id, d) {
     // Parse has mergeTables
     confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
     // Reducto Parse has confidence
-    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? d.compatibleNodes?.parse ?? false,
-    // Reducto Parse has bounding boxes
+    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
+    // Reducto Parse has text bounding boxes
+    imageBoundingBoxes: isParse ? true : false,
+    // Reducto Parse has figure bounding boxes
     schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
     // Extract has schema validation
     handwrittenText: false,
     // Reducto doesn't specifically advertise handwriting
     headerFooterExtraction: true,
     // Reducto has Header/Footer block types
+    // Extended features
+    embedOptimized: isParse,
+    // Reducto Parse supports retrieval.embedding_optimized: true
+    passwordProtected: true,
+    // Reducto handles encrypted PDFs
+    contentFiltering: true,
+    // Reducto can filter block types
+    ocrMode: opts.ocrSystem ?? false,
+    // Reducto has ocr_system selection
+    webhookCallback: true,
+    // Reducto supports webhook callbacks
+    mediaResolution: false,
+    changeTracking: true,
+    // Reducto tracks changes in Word docs
+    hyperlinkExtraction: true,
+    // Reducto extracts hyperlinks via formatting.include
+    chartUnderstanding: isParse,
+    // Reducto enhance.agentic[].advanced_chart_agent for figures
+    imageCaptions: false,
+    // Not available in Reducto
+    signatureExtraction: false,
+    // NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
+    commentExtraction: isParse || isExtract,
+    // Reducto formatting.include: ["comments"]
+    highlightExtraction: isParse || isExtract,
+    // Reducto formatting.include: ["highlight"]
+    figureSummaries: isParse,
+    // Reducto enhance.summarize_figures
     outputFormats
   };
   return {
@@ -1980,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
     // Unsiloed doesn't have page range option
     languageHints: false,
     // Unsiloed doesn't support language hints
-    processingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
+    processingModes: false,
+    // Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
     agenticMode: false,
     // Unsiloed doesn't have agentic mode
     customPrompts: false,
@@ -2002,14 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
     tableMerging: false,
     confidence: d.outputFormat?.features?.confidence ?? false,
     // Unsiloed may provide confidence
-    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
-    // Unsiloed may provide bounding boxes
+    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
+    // Unsiloed Parse has bounding boxes
+    imageBoundingBoxes: false,
+    // Unsiloed doesn't return image-specific bboxes
     schemaValidation: isExtract,
     // Extract supports schema validation
-    handwrittenText: false,
-    // Unsiloed doesn't specifically advertise handwriting
+    handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
+    // Parse supports handwriting
     headerFooterExtraction: false,
     // Unsiloed doesn't extract header/footer separately
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: isParse,
+    // Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
+    ocrMode: isParse,
+    // Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
+    webhookCallback: false,
+    // Unsiloed is synchronous
+    mediaResolution: false,
+    changeTracking: false,
+    hyperlinkExtraction: false,
+    chartUnderstanding: false,
+    // Not available in Unsiloed
+    imageCaptions: false,
+    // Not available in Unsiloed
+    signatureExtraction: false,
+    // Not available in Unsiloed
+    commentExtraction: false,
+    // Not available in Unsiloed
+    highlightExtraction: false,
+    // Not available in Unsiloed
+    figureSummaries: false,
+    // Not available in Unsiloed
     outputFormats
   };
   return {
@@ -2038,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
       supportsImageExtraction: false,
       supportsPageMarkers: false,
       supportsLanguageHints: false,
-      supportsProcessingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
+      supportsProcessingModes: false,
+      // Unsiloed doesn't have fast/balanced/high_accuracy modes
       supportsSegmentation: isSplit || isCategorize,
       outputFormats
     },
@@ -2256,7 +2383,7 @@ function matchesModelFilter(model, filter) {
   }
   if (filter.hasFeatures && filter.hasFeatures.length > 0) {
     for (const feature of filter.hasFeatures) {
-      if (model.features[feature] !== true) {
+      if (!isFeatureEnabled(model.features[feature])) {
         return false;
       }
     }
@@ -2299,6 +2426,44 @@ function getAllModels() {
 function clearModelRegistry() {
   modelRegistry.clear();
 }
+var PAGE_INDEXING = {
+  datalab: "0-indexed",
+  reducto: "1-indexed",
+  mistral: "0-indexed",
+  unsiloed: "1-indexed",
+  // Default assumption
+  llm: "1-indexed"
+  // N/A but default
+};
+function getPageIndexing(provider) {
+  const source = typeof provider === "string" ? provider : provider.source;
+  return PAGE_INDEXING[source] ?? "1-indexed";
+}
+function transformDerivedFeatures(options, provider) {
+  const { maxPages, pageRange, ...remainingOptions } = options;
+  const result = { remainingOptions };
+  if (pageRange !== void 0) {
+    result.page_range = pageRange;
+    return result;
+  }
+  if (maxPages !== void 0 && provider.features.maxPages === "derived") {
+    const indexing = getPageIndexing(provider);
+    if (indexing === "0-indexed") {
+      result.page_range = `0-${maxPages - 1}`;
+      if (provider.source === "mistral") {
+        result.pages = Array.from({ length: maxPages }, (_, i) => i);
+      }
+    } else {
+      result.page_range = `1-${maxPages}`;
+    }
+  } else if (maxPages !== void 0 && isFeatureEnabled(provider.features.maxPages)) {
+    result.remainingOptions.maxPages = maxPages;
+  }
+  return result;
+}
+function requiresMaxPagesTransformation(provider) {
+  return provider.features.maxPages === "derived";
+}
 function normalizeMistralProvider(id, d) {
   const opts = d.supportedOptions ?? {};
   const isVLM = d.type === "VLM";
@@ -2314,7 +2479,7 @@ function normalizeMistralProvider(id, d) {
   const features = {
     maxPages: d.inputFormats?.maxPages !== void 0,
     pageRange: true,
-    // Mistral supports pages param: "0-5" or [0,2,5]
+    // Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
     languageHints: false,
     // Mistral doesn't support language hints
     processingModes: false,
@@ -2342,14 +2507,39 @@ function normalizeMistralProvider(id, d) {
     tableMerging: false,
     confidence: false,
     // Mistral doesn't provide confidence scores
-    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
-    // NO text-level bboxes
+    boundingBoxes: false,
+    // Mistral does NOT provide text-level bounding boxes
+    imageBoundingBoxes: true,
+    // Mistral provides image/figure bounding boxes only
     schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
     // VLM supports schema
     handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
     // Excellent handwriting support
     headerFooterExtraction: opts.extractHeader ?? opts.extractFooter ?? false,
     // extract_header/extract_footer
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: false,
+    // Mistral is synchronous
+    mediaResolution: false,
+    changeTracking: false,
+    hyperlinkExtraction: true,
+    // Response pages[].hyperlinks[] auto-extracted
+    chartUnderstanding: false,
+    // Not available as separate feature in Mistral
+    imageCaptions: false,
+    // Not available in Mistral
+    signatureExtraction: false,
+    // Not available in Mistral
+    commentExtraction: false,
+    // Not available in Mistral
+    highlightExtraction: false,
+    // Not available in Mistral
+    figureSummaries: false,
+    // Not available in Mistral
     outputFormats
   };
   return {
@@ -2366,8 +2556,8 @@ function normalizeMistralProvider(id, d) {
     capabilities: {
       supportsImages: d.capabilities?.supportsImages ?? true,
       supportsPDFs: d.capabilities?.supportsPDFs ?? true,
-      supportsDocuments: d.capabilities?.supportsDocuments ?? false,
-      // DOCX/PPTX has known issues
+      supportsDocuments: d.capabilities?.supportsDocuments ?? true,
+      // Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
       supportsReasoning: false,
       // OCR 3 doesn't do reasoning
       supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
@@ -2635,6 +2825,7 @@ export {
   getNodeTypeName,
   getPDFPageCount,
   getPageCountMetadata,
+  getPageIndexing,
   getProviderById,
   getProvidersBySource,
   getProvidersForLargeFiles,
@@ -2642,6 +2833,7 @@ export {
   getSuggestedConnections,
   getTotalPageCount,
   getValidForEachStarters,
+  isFeatureEnabled,
   isLocalEndpoint,
   isPDFDocument,
   isRetryableError,
@@ -2653,11 +2845,13 @@ export {
   queryProviders,
   registerProviderMetadata,
   registerProviderWithModels,
+  requiresMaxPagesTransformation,
   resolveDocument,
   resolveModelMetadata,
   runPipeline,
   splitPDFIntoChunks,
   toProviderString,
+  transformDerivedFeatures,
   validateFlowInputFormat,
   validateJson,
   validateMimeType,