npm - @doclo/core - Versions diffs - 0.1.11 → 0.2.0 - Mend

@doclo/core 0.1.11 → 0.2.0

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (8) hide show

package/dist/index.d.ts +131 -25
package/dist/index.js +351 -29
package/dist/index.js.map +1 -1
package/dist/internal/validation-utils.d.ts +1 -1
package/dist/pdf-utils.d.ts +1 -1
package/dist/{validation-D_EcHqPl.d.ts → validation-wlK06puw.d.ts} +1 -1
package/dist/validation.d.ts +1 -1
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -1398,6 +1398,9 @@ function createIdentity(provider, model, opts) {
 }
 // src/provider-query.ts
+function isFeatureEnabled(status) {
+  return status === true || status === "deprecated" || status === "derived";
+}
 var providerRegistry = /* @__PURE__ */ new Map();
 function registerProviderMetadata(source, metadata, normalizer) {
   const normalized = /* @__PURE__ */ new Map();
@@ -1482,7 +1485,7 @@ function queryProviders(filter = {}) {
   }
   if (filter.hasFeatures && filter.hasFeatures.length > 0) {
     providers = providers.filter(
-      (p) => filter.hasFeatures.every((feature) => p.features[feature] === true)
+      (p) => filter.hasFeatures.every((feature) => isFeatureEnabled(p.features[feature]))
     );
   }
   if (filter.outputFormat) {
@@ -1558,6 +1561,8 @@ function defaultNormalizer(id, data, source) {
     return normalizeReductoProvider(id, d);
   } else if (source === "unsiloed") {
     return normalizeUnsiloedProvider(id, d);
+  } else if (source === "mistral") {
+    return normalizeMistralProvider(id, d);
   }
   const defaultOutputFormats = { text: true, markdown: false, html: false, json: false };
   const defaultFeatures = {
@@ -1579,8 +1584,25 @@ function defaultNormalizer(id, data, source) {
     tableMerging: false,
     confidence: false,
     boundingBoxes: false,
+    imageBoundingBoxes: false,
     schemaValidation: false,
     handwrittenText: false,
+    headerFooterExtraction: false,
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: false,
+    mediaResolution: false,
+    changeTracking: false,
+    hyperlinkExtraction: false,
+    chartUnderstanding: false,
+    imageCaptions: false,
+    signatureExtraction: false,
+    commentExtraction: false,
+    highlightExtraction: false,
+    figureSummaries: false,
     outputFormats: defaultOutputFormats
   };
   return {
@@ -1635,10 +1657,12 @@ function normalizeLLMProvider(id, d) {
     html: true,
     json: d.capabilities?.supportsStructuredOutput ?? true
   };
+  const vendor = d.vendor ?? id;
   const features = {
-    maxPages: d.inputFormats?.pdfs?.maxPages !== void 0,
-    pageRange: true,
-    // LLMs can handle page ranges
+    maxPages: "derived",
+    // SDK can limit via pre-processing
+    pageRange: false,
+    // No native API support - LLMs receive full text
     languageHints: false,
     // Not applicable to LLMs
     processingModes: false,
@@ -1651,8 +1675,8 @@ function normalizeLLMProvider(id, d) {
     // LLMs don't extract images
     pageMarkers: false,
     // LLMs don't add page markers
-    citations: false,
-    // Most LLMs don't have native citations (Anthropic has different API)
+    citations: vendor === "anthropic" ? true : false,
+    // Anthropic has Citations API
     chunking: false,
     // LLMs don't do chunking
     segmentation: false,
@@ -1666,13 +1690,32 @@ function normalizeLLMProvider(id, d) {
     // LLMs don't provide confidence scores
     boundingBoxes: false,
     // LLMs don't provide bounding boxes
+    imageBoundingBoxes: false,
+    // LLMs don't provide image bounding boxes (Gemini 2.0+ can via specific prompting, but not a simple toggle)
     schemaValidation: d.capabilities?.supportsStructuredOutput ?? false,
     // Some LLMs support schema validation
     handwrittenText: false,
     // Not specific to LLMs
+    headerFooterExtraction: false,
+    // LLMs don't extract header/footer separately
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: false,
+    mediaResolution: vendor === "google" ? true : false,
+    // Google Gemini has mediaResolution
+    changeTracking: false,
+    hyperlinkExtraction: false,
+    chartUnderstanding: false,
+    imageCaptions: false,
+    signatureExtraction: false,
+    commentExtraction: false,
+    highlightExtraction: false,
+    figureSummaries: false,
     outputFormats
   };
-  const vendor = d.vendor ?? id;
   return {
     id,
     name: d.name ?? id,
@@ -1693,7 +1736,8 @@ function normalizeLLMProvider(id, d) {
       supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? false,
       // NEW capabilities
       supportsPrompts: true,
-      supportsCitations: false,
+      supportsCitations: vendor === "anthropic",
+      // Anthropic has Citations API
       supportsChunking: false,
       supportsImageExtraction: false,
       supportsPageMarkers: false,
@@ -1740,6 +1784,8 @@ function normalizeLLMProvider(id, d) {
 function normalizeDatalabProvider(id, d) {
   const opts = d.supportedOptions ?? {};
   const isVLM = d.type === "VLM";
+  const isMarkerOCR = id === "marker-ocr" || id.includes("marker-ocr");
+  const isMarkerVLM = id === "marker-vlm" || id.includes("marker-vlm");
   const model = d.model ?? id;
   const outputFormats = {
     text: true,
@@ -1750,33 +1796,61 @@ function normalizeDatalabProvider(id, d) {
   const features = {
     maxPages: opts.maxPages ?? false,
     pageRange: opts.pageRange ?? false,
-    languageHints: opts.langs ?? false,
-    // maps from 'langs'
+    languageHints: opts.langs ? "deprecated" : false,
+    // API ignores, handled automatically
     processingModes: opts.mode ?? false,
     agenticMode: false,
     // Datalab doesn't have agentic mode
-    customPrompts: opts.blockCorrectionPrompt ?? false,
+    customPrompts: opts.blockCorrectionPrompt ? "deprecated" : false,
+    // Not currently supported
     imageExtraction: opts.extractImages ?? false,
     pageMarkers: opts.paginate ?? false,
     // maps from 'paginate'
-    citations: opts.citations ?? false,
+    citations: isMarkerVLM ? true : false,
+    // Marker VLM has citations
     chunking: false,
     // Datalab doesn't have chunking
     segmentation: opts.segmentation ?? false,
-    stripExistingOCR: opts.stripExistingOCR ?? false,
-    formatLines: opts.formatLines ?? false,
-    forceOCR: true,
-    // Datalab supports force_ocr
+    stripExistingOCR: opts.stripExistingOCR ? "deprecated" : false,
+    // Managed automatically
+    formatLines: opts.formatLines ? "deprecated" : false,
+    // Handled automatically
+    forceOCR: "deprecated",
+    // DEPRECATED: force_ocr param has no effect per API docs
     tableOutputFormats: false,
     tableMerging: false,
     confidence: false,
     // Datalab doesn't provide confidence scores
     boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? true,
-    // Datalab provides bounding boxes
+    // Datalab Surya provides text bboxes
+    imageBoundingBoxes: isMarkerOCR || isMarkerVLM ? true : false,
+    // Marker extracts images with bboxes
     schemaValidation: isVLM,
     // VLM providers support schema validation
     handwrittenText: true,
     // Datalab handles handwritten text
+    headerFooterExtraction: false,
+    // Datalab has issues with header/footer extraction
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: true,
+    // Datalab supports webhook callbacks
+    mediaResolution: false,
+    changeTracking: true,
+    // Datalab marker_extras supports track_changes
+    hyperlinkExtraction: isMarkerOCR || isMarkerVLM,
+    // Datalab extras=extract_links
+    chartUnderstanding: isMarkerOCR || isMarkerVLM,
+    // Datalab extras=chart_understanding
+    imageCaptions: isMarkerOCR || isMarkerVLM,
+    // Datalab disable_image_captions param
+    signatureExtraction: false,
+    commentExtraction: false,
+    highlightExtraction: false,
+    figureSummaries: false,
     outputFormats
   };
   return {
@@ -1845,6 +1919,7 @@ function normalizeReductoProvider(id, d) {
   const opts = d.supportedOptions ?? {};
   const isVLM = d.type === "VLM";
   const isExtract = d.compatibleNodes?.extract === true;
+  const isParse = d.compatibleNodes?.parse === true;
   const model = d.model ?? "v1";
   const outputFormats = {
     text: d.outputFormat?.features?.textLines ?? true,
@@ -1854,10 +1929,11 @@ function normalizeReductoProvider(id, d) {
     json: d.outputFormat?.features?.structuredJSON ?? isExtract
   };
   const features = {
-    maxPages: opts.maxPages ?? false,
+    maxPages: opts.pageRange ?? false ? "derived" : false,
+    // SDK derives from pageRange (1-indexed)
     pageRange: opts.pageRange ?? false,
-    languageHints: opts.langs ?? false,
-    // Reducto doesn't support langs
+    languageHints: false,
+    // Reducto doesn't support language hints
     processingModes: false,
     // Reducto uses agentic instead
     agenticMode: opts.mode ?? false,
@@ -1880,12 +1956,44 @@ function normalizeReductoProvider(id, d) {
     // Parse has mergeTables
     confidence: opts.confidence ?? d.outputFormat?.features?.confidence ?? false,
     // Reducto Parse has confidence
-    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? d.compatibleNodes?.parse ?? false,
-    // Reducto Parse has bounding boxes
+    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
+    // Reducto Parse has text bounding boxes
+    imageBoundingBoxes: isParse ? true : false,
+    // Reducto Parse has figure bounding boxes
     schemaValidation: d.outputFormat?.features?.schemaValidation ?? isExtract,
     // Extract has schema validation
     handwrittenText: false,
     // Reducto doesn't specifically advertise handwriting
+    headerFooterExtraction: true,
+    // Reducto has Header/Footer block types
+    // Extended features
+    embedOptimized: isParse,
+    // Reducto Parse supports retrieval.embedding_optimized: true
+    passwordProtected: true,
+    // Reducto handles encrypted PDFs
+    contentFiltering: true,
+    // Reducto can filter block types
+    ocrMode: opts.ocrSystem ?? false,
+    // Reducto has ocr_system selection
+    webhookCallback: true,
+    // Reducto supports webhook callbacks
+    mediaResolution: false,
+    changeTracking: true,
+    // Reducto tracks changes in Word docs
+    hyperlinkExtraction: true,
+    // Reducto extracts hyperlinks via formatting.include
+    chartUnderstanding: isParse,
+    // Reducto enhance.agentic[].advanced_chart_agent for figures
+    imageCaptions: false,
+    // Not available in Reducto
+    signatureExtraction: false,
+    // NOT supported - formatting.include only accepts: change_tracking, highlight, comments, hyperlinks
+    commentExtraction: isParse || isExtract,
+    // Reducto formatting.include: ["comments"]
+    highlightExtraction: isParse || isExtract,
+    // Reducto formatting.include: ["highlight"]
+    figureSummaries: isParse,
+    // Reducto enhance.summarize_figures
     outputFormats
   };
   return {
@@ -1971,7 +2079,8 @@ function normalizeUnsiloedProvider(id, d) {
     // Unsiloed doesn't have page range option
     languageHints: false,
     // Unsiloed doesn't support language hints
-    processingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
+    processingModes: false,
+    // Unsiloed doesn't have fast/balanced/high_accuracy modes like Datalab
     agenticMode: false,
     // Unsiloed doesn't have agentic mode
     customPrompts: false,
@@ -1993,12 +2102,40 @@ function normalizeUnsiloedProvider(id, d) {
     tableMerging: false,
     confidence: d.outputFormat?.features?.confidence ?? false,
     // Unsiloed may provide confidence
-    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
-    // Unsiloed may provide bounding boxes
+    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? isParse,
+    // Unsiloed Parse has bounding boxes
+    imageBoundingBoxes: false,
+    // Unsiloed doesn't return image-specific bboxes
     schemaValidation: isExtract,
     // Extract supports schema validation
-    handwrittenText: false,
-    // Unsiloed doesn't specifically advertise handwriting
+    handwrittenText: d.capabilities?.specialFeatures?.includes("handwritten text") ?? false,
+    // Parse supports handwriting
+    headerFooterExtraction: false,
+    // Unsiloed doesn't extract header/footer separately
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: isParse,
+    // Parse supports keep_segment_types: ["table", "picture", "formula", "text"]
+    ocrMode: isParse,
+    // Parse endpoint supports ocr_mode: 'auto_ocr' | 'full_ocr'
+    webhookCallback: false,
+    // Unsiloed is synchronous
+    mediaResolution: false,
+    changeTracking: false,
+    hyperlinkExtraction: false,
+    chartUnderstanding: false,
+    // Not available in Unsiloed
+    imageCaptions: false,
+    // Not available in Unsiloed
+    signatureExtraction: false,
+    // Not available in Unsiloed
+    commentExtraction: false,
+    // Not available in Unsiloed
+    highlightExtraction: false,
+    // Not available in Unsiloed
+    figureSummaries: false,
+    // Not available in Unsiloed
     outputFormats
   };
   return {
@@ -2027,7 +2164,8 @@ function normalizeUnsiloedProvider(id, d) {
       supportsImageExtraction: false,
       supportsPageMarkers: false,
       supportsLanguageHints: false,
-      supportsProcessingModes: d.capabilities?.specialFeatures?.includes("YOLO segmentation") ?? false,
+      supportsProcessingModes: false,
+      // Unsiloed doesn't have fast/balanced/high_accuracy modes
       supportsSegmentation: isSplit || isCategorize,
       outputFormats
     },
@@ -2245,7 +2383,7 @@ function matchesModelFilter(model, filter) {
   }
   if (filter.hasFeatures && filter.hasFeatures.length > 0) {
     for (const feature of filter.hasFeatures) {
-      if (model.features[feature] !== true) {
+      if (!isFeatureEnabled(model.features[feature])) {
         return false;
       }
     }
@@ -2288,6 +2426,186 @@ function getAllModels() {
 function clearModelRegistry() {
   modelRegistry.clear();
 }
+var PAGE_INDEXING = {
+  datalab: "0-indexed",
+  reducto: "1-indexed",
+  mistral: "0-indexed",
+  unsiloed: "1-indexed",
+  // Default assumption
+  llm: "1-indexed"
+  // N/A but default
+};
+function getPageIndexing(provider) {
+  const source = typeof provider === "string" ? provider : provider.source;
+  return PAGE_INDEXING[source] ?? "1-indexed";
+}
+function transformDerivedFeatures(options, provider) {
+  const { maxPages, pageRange, ...remainingOptions } = options;
+  const result = { remainingOptions };
+  if (pageRange !== void 0) {
+    result.page_range = pageRange;
+    return result;
+  }
+  if (maxPages !== void 0 && provider.features.maxPages === "derived") {
+    const indexing = getPageIndexing(provider);
+    if (indexing === "0-indexed") {
+      result.page_range = `0-${maxPages - 1}`;
+      if (provider.source === "mistral") {
+        result.pages = Array.from({ length: maxPages }, (_, i) => i);
+      }
+    } else {
+      result.page_range = `1-${maxPages}`;
+    }
+  } else if (maxPages !== void 0 && isFeatureEnabled(provider.features.maxPages)) {
+    result.remainingOptions.maxPages = maxPages;
+  }
+  return result;
+}
+function requiresMaxPagesTransformation(provider) {
+  return provider.features.maxPages === "derived";
+}
+function normalizeMistralProvider(id, d) {
+  const opts = d.supportedOptions ?? {};
+  const isVLM = d.type === "VLM";
+  const isOCR = d.type === "OCR";
+  const model = d.model ?? id;
+  const outputFormats = {
+    text: true,
+    markdown: d.outputFormat?.features?.markdown ?? isOCR,
+    html: d.outputFormat?.features?.htmlTables ?? isOCR,
+    // OCR 3 can output HTML tables
+    json: d.outputFormat?.features?.structuredJSON ?? isVLM
+  };
+  const features = {
+    maxPages: d.inputFormats?.maxPages !== void 0,
+    pageRange: true,
+    // Mistral supports pages param: "0-5" or [0,2,5] (0-indexed)
+    languageHints: false,
+    // Mistral doesn't support language hints
+    processingModes: false,
+    // Mistral doesn't have processing modes
+    agenticMode: false,
+    // Mistral doesn't have agentic mode
+    customPrompts: false,
+    // Mistral OCR 3 doesn't support custom prompts
+    imageExtraction: opts.includeImageBase64 ?? false,
+    // Can include embedded images
+    pageMarkers: false,
+    // Mistral doesn't add page markers
+    citations: false,
+    // Mistral doesn't provide citations
+    chunking: false,
+    // Mistral doesn't do chunking
+    segmentation: false,
+    // Mistral doesn't do segmentation
+    stripExistingOCR: false,
+    formatLines: false,
+    forceOCR: true,
+    // OCR 3 always does OCR
+    tableOutputFormats: opts.tableFormat ?? isOCR,
+    // html or markdown table format
+    tableMerging: false,
+    confidence: false,
+    // Mistral doesn't provide confidence scores
+    boundingBoxes: false,
+    // Mistral does NOT provide text-level bounding boxes
+    imageBoundingBoxes: true,
+    // Mistral provides image/figure bounding boxes only
+    schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
+    // VLM supports schema
+    handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
+    // Excellent handwriting support
+    headerFooterExtraction: opts.extractHeader ?? opts.extractFooter ?? false,
+    // extract_header/extract_footer
+    // Extended features
+    embedOptimized: false,
+    passwordProtected: false,
+    contentFiltering: false,
+    ocrMode: false,
+    webhookCallback: false,
+    // Mistral is synchronous
+    mediaResolution: false,
+    changeTracking: false,
+    hyperlinkExtraction: true,
+    // Response pages[].hyperlinks[] auto-extracted
+    chartUnderstanding: false,
+    // Not available as separate feature in Mistral
+    imageCaptions: false,
+    // Not available in Mistral
+    signatureExtraction: false,
+    // Not available in Mistral
+    commentExtraction: false,
+    // Not available in Mistral
+    highlightExtraction: false,
+    // Not available in Mistral
+    figureSummaries: false,
+    // Not available in Mistral
+    outputFormats
+  };
+  return {
+    id: d.id ?? id,
+    name: d.name ?? id,
+    source: "mistral",
+    type: d.type ?? "OCR",
+    // 3-layer identity
+    identity: {
+      provider: "mistral",
+      model,
+      method: "native"
+    },
+    capabilities: {
+      supportsImages: d.capabilities?.supportsImages ?? true,
+      supportsPDFs: d.capabilities?.supportsPDFs ?? true,
+      supportsDocuments: d.capabilities?.supportsDocuments ?? true,
+      // Supports DOCX, PPTX, TXT, EPUB, RTF, ODT, etc. (NOT XLSX)
+      supportsReasoning: false,
+      // OCR 3 doesn't do reasoning
+      supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
+      // Extended capabilities
+      supportsPrompts: false,
+      supportsCitations: false,
+      supportsChunking: false,
+      supportsImageExtraction: opts.includeImageBase64 ?? false,
+      supportsPageMarkers: false,
+      supportsLanguageHints: false,
+      supportsProcessingModes: false,
+      supportsSegmentation: false,
+      outputFormats
+    },
+    features,
+    // Mistral providers always need raw document input
+    inputRequirements: {
+      inputType: d.inputRequirements?.inputType ?? "raw-document",
+      acceptedMethods: d.inputRequirements?.acceptedMethods ?? ["base64", "url"]
+    },
+    compatibleNodes: {
+      parse: d.compatibleNodes?.parse ?? isOCR,
+      extract: d.compatibleNodes?.extract ?? isVLM,
+      categorize: d.compatibleNodes?.categorize ?? false,
+      qualify: d.compatibleNodes?.qualify ?? false,
+      split: d.compatibleNodes?.split ?? false
+    },
+    inputFormats: {
+      imageMimeTypes: (d.inputFormats?.mimeTypes ?? []).filter((m) => m.startsWith("image/")),
+      documentMimeTypes: (d.inputFormats?.mimeTypes ?? []).filter((m) => !m.startsWith("image/")),
+      inputMethods: d.inputFormats?.inputMethods ?? ["base64", "url"],
+      maxFileSize: d.inputFormats?.maxFileSize ?? 50,
+      // 50MB limit
+      maxPages: d.inputFormats?.maxPages ?? 1e3
+    },
+    pricing: {
+      model: "per-page",
+      perPage: d.pricing?.perPage ?? 2e-3,
+      // $2/1000 pages
+      currency: "USD",
+      notes: d.pricing?.notes ?? "$2 per 1000 pages"
+    },
+    rateLimits: {
+      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
+    },
+    raw: d
+  };
+}
 // src/retry.ts
 var DEFAULT_RETRY_CONFIG = {
@@ -2507,6 +2825,7 @@ export {
   getNodeTypeName,
   getPDFPageCount,
   getPageCountMetadata,
+  getPageIndexing,
   getProviderById,
   getProvidersBySource,
   getProvidersForLargeFiles,
@@ -2514,6 +2833,7 @@ export {
   getSuggestedConnections,
   getTotalPageCount,
   getValidForEachStarters,
+  isFeatureEnabled,
   isLocalEndpoint,
   isPDFDocument,
   isRetryableError,
@@ -2525,11 +2845,13 @@ export {
   queryProviders,
   registerProviderMetadata,
   registerProviderWithModels,
+  requiresMaxPagesTransformation,
   resolveDocument,
   resolveModelMetadata,
   runPipeline,
   splitPDFIntoChunks,
   toProviderString,
+  transformDerivedFeatures,
   validateFlowInputFormat,
   validateJson,
   validateMimeType,