npm - @doclo/core - Versions diffs - 0.1.11 → 0.1.12 - Mend

@doclo/core 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/index.d.ts +4 -2
package/dist/index.js +128 -0
package/dist/index.js.map +1 -1
package/dist/internal/validation-utils.d.ts +1 -1
package/dist/pdf-utils.d.ts +1 -1
package/dist/{validation-D_EcHqPl.d.ts → validation-wlK06puw.d.ts} +1 -1
package/dist/validation.d.ts +1 -1
package/package.json +1 -1

package/dist/index.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { P as ProviderVendor, A as AccessMethod } from './validation-D_EcHqPl.js';
-export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-D_EcHqPl.js';
+import { P as ProviderVendor, A as AccessMethod } from './validation-wlK06puw.js';
+export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-wlK06puw.js';
 export { getDocumentPageCount, getPDFPageCount, getPageCountMetadata, getTotalPageCount, splitPDFIntoChunks } from './pdf-utils.js';
 /**
@@ -559,6 +559,8 @@ type NormalizedFeatures = {
     schemaValidation: boolean;
     /** Handwritten text recognition support */
     handwrittenText: boolean;
+    /** Separate header/footer extraction from main content */
+    headerFooterExtraction: boolean;
     /** Supported output formats */
     outputFormats: OutputFormatSupport;
 };

package/dist/index.js CHANGED Viewed

@@ -1558,6 +1558,8 @@ function defaultNormalizer(id, data, source) {
     return normalizeReductoProvider(id, d);
   } else if (source === "unsiloed") {
     return normalizeUnsiloedProvider(id, d);
+  } else if (source === "mistral") {
+    return normalizeMistralProvider(id, d);
   }
   const defaultOutputFormats = { text: true, markdown: false, html: false, json: false };
   const defaultFeatures = {
@@ -1581,6 +1583,7 @@ function defaultNormalizer(id, data, source) {
     boundingBoxes: false,
     schemaValidation: false,
     handwrittenText: false,
+    headerFooterExtraction: false,
     outputFormats: defaultOutputFormats
   };
   return {
@@ -1670,6 +1673,8 @@ function normalizeLLMProvider(id, d) {
     // Some LLMs support schema validation
     handwrittenText: false,
     // Not specific to LLMs
+    headerFooterExtraction: false,
+    // LLMs don't extract header/footer separately
     outputFormats
   };
   const vendor = d.vendor ?? id;
@@ -1777,6 +1782,8 @@ function normalizeDatalabProvider(id, d) {
     // VLM providers support schema validation
     handwrittenText: true,
     // Datalab handles handwritten text
+    headerFooterExtraction: false,
+    // Datalab has issues with header/footer extraction
     outputFormats
   };
   return {
@@ -1886,6 +1893,8 @@ function normalizeReductoProvider(id, d) {
     // Extract has schema validation
     handwrittenText: false,
     // Reducto doesn't specifically advertise handwriting
+    headerFooterExtraction: true,
+    // Reducto has Header/Footer block types
     outputFormats
   };
   return {
@@ -1999,6 +2008,8 @@ function normalizeUnsiloedProvider(id, d) {
     // Extract supports schema validation
     handwrittenText: false,
     // Unsiloed doesn't specifically advertise handwriting
+    headerFooterExtraction: false,
+    // Unsiloed doesn't extract header/footer separately
     outputFormats
   };
   return {
@@ -2288,6 +2299,123 @@ function getAllModels() {
 function clearModelRegistry() {
   modelRegistry.clear();
 }
+function normalizeMistralProvider(id, d) { // Builds the normalized provider record (identity, capabilities, features, formats, pricing) for a "mistral" source entry; `id` is the fallback for d.id, d.name and d.model.
+  const opts = d.supportedOptions ?? {}; // a missing supportedOptions is treated as "no options declared"
+  const isVLM = d.type === "VLM"; // strict match; false when d.type is absent
+  const isOCR = d.type === "OCR"; // NOTE(review): also false when d.type is absent, although the returned `type` below defaults to "OCR" — confirm the OCR-based defaults should not apply in that case
+  const model = d.model ?? id;
+  const outputFormats = { // shared by both `capabilities.outputFormats` and `features.outputFormats` below
+    text: true,
+    markdown: d.outputFormat?.features?.markdown ?? isOCR,
+    html: d.outputFormat?.features?.htmlTables ?? isOCR,
+    // html: driven by the htmlTables flag, defaulting to isOCR (OCR 3 can output HTML tables)
+    json: d.outputFormat?.features?.structuredJSON ?? isVLM
+  };
+  const features = {
+    maxPages: d.inputFormats?.maxPages !== void 0, // true only when the provider data declares a page limit
+    pageRange: true,
+    // Mistral supports pages param: "0-5" or [0,2,5]
+    languageHints: false,
+    // Mistral doesn't support language hints
+    processingModes: false,
+    // Mistral doesn't have processing modes
+    agenticMode: false,
+    // Mistral doesn't have agentic mode
+    customPrompts: false,
+    // Mistral OCR 3 doesn't support custom prompts
+    imageExtraction: opts.includeImageBase64 ?? false,
+    // Can include embedded images; the includeImageBase64 value is passed through as-is (presumably a boolean — verify against provider data)
+    pageMarkers: false,
+    // Mistral doesn't add page markers
+    citations: false,
+    // Mistral doesn't provide citations
+    chunking: false,
+    // Mistral doesn't do chunking
+    segmentation: false,
+    // Mistral doesn't do segmentation
+    stripExistingOCR: false,
+    formatLines: false,
+    forceOCR: true,
+    // OCR 3 always does OCR
+    tableOutputFormats: opts.tableFormat ?? isOCR,
+    // html or markdown table format. NOTE(review): tableFormat is passed through unchanged, so this may hold a non-boolean value when it is set — confirm the expected type
+    tableMerging: false,
+    confidence: false,
+    // Mistral doesn't provide confidence scores
+    boundingBoxes: d.outputFormat?.features?.boundingBoxes ?? false,
+    // Defaults to false (no text-level bboxes) unless the provider data sets the flag
+    schemaValidation: d.outputFormat?.features?.schemaValidation ?? isVLM,
+    // Defaults to isVLM (VLM supports schema)
+    handwrittenText: d.outputFormat?.features?.handwrittenText ?? true,
+    // Defaults to true (excellent handwriting support)
+    headerFooterExtraction: opts.extractHeader ?? opts.extractFooter ?? false,
+    // extract_header/extract_footer. NOTE(review): ?? only skips null/undefined, so extractHeader === false hides extractFooter === true — confirm whether a logical OR of the two flags was intended
+    outputFormats
+  };
+  return {
+    id: d.id ?? id,
+    name: d.name ?? id,
+    source: "mistral",
+    type: d.type ?? "OCR", // default applies to the returned record only; isOCR/isVLM above were computed from the raw d.type
+    // 3-layer identity
+    identity: {
+      provider: "mistral",
+      model,
+      method: "native"
+    },
+    capabilities: {
+      supportsImages: d.capabilities?.supportsImages ?? true,
+      supportsPDFs: d.capabilities?.supportsPDFs ?? true,
+      supportsDocuments: d.capabilities?.supportsDocuments ?? false,
+      // Defaults to false: DOCX/PPTX has known issues
+      supportsReasoning: false,
+      // OCR 3 doesn't do reasoning
+      supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
+      // Extended capabilities (hard-coded except image extraction, which mirrors features.imageExtraction)
+      supportsPrompts: false,
+      supportsCitations: false,
+      supportsChunking: false,
+      supportsImageExtraction: opts.includeImageBase64 ?? false,
+      supportsPageMarkers: false,
+      supportsLanguageHints: false,
+      supportsProcessingModes: false,
+      supportsSegmentation: false,
+      outputFormats
+    },
+    features,
+    // Input defaults to a raw document delivered as base64 or URL; provider data may override both fields
+    inputRequirements: {
+      inputType: d.inputRequirements?.inputType ?? "raw-document",
+      acceptedMethods: d.inputRequirements?.acceptedMethods ?? ["base64", "url"]
+    },
+    compatibleNodes: { // parse defaults to isOCR, extract to isVLM; the remaining nodes default to false
+      parse: d.compatibleNodes?.parse ?? isOCR,
+      extract: d.compatibleNodes?.extract ?? isVLM,
+      categorize: d.compatibleNodes?.categorize ?? false,
+      qualify: d.compatibleNodes?.qualify ?? false,
+      split: d.compatibleNodes?.split ?? false
+    },
+    inputFormats: {
+      imageMimeTypes: (d.inputFormats?.mimeTypes ?? []).filter((m) => m.startsWith("image/")), // mimeTypes is partitioned by the "image/" prefix
+      documentMimeTypes: (d.inputFormats?.mimeTypes ?? []).filter((m) => !m.startsWith("image/")), // everything that is not image/* counts as a document type
+      inputMethods: d.inputFormats?.inputMethods ?? ["base64", "url"],
+      maxFileSize: d.inputFormats?.maxFileSize ?? 50,
+      // 50MB limit
+      maxPages: d.inputFormats?.maxPages ?? 1e3 // falls back to 1000 pages when the provider data gives no limit
+    },
+    pricing: {
+      model: "per-page",
+      perPage: d.pricing?.perPage ?? 2e-3,
+      // 0.002 per page, i.e. $2/1000 pages
+      currency: "USD",
+      notes: d.pricing?.notes ?? "$2 per 1000 pages"
+    },
+    rateLimits: {
+      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute // left undefined when the provider data has no rate limit
+    },
+    raw: d // original provider data, kept by reference (not copied)
+  };
+}
 // src/retry.ts
 var DEFAULT_RETRY_CONFIG = {