@doclo/core 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { P as ProviderVendor, A as AccessMethod } from './validation-D_EcHqPl.js';
2
- export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-D_EcHqPl.js';
1
+ import { P as ProviderVendor, A as AccessMethod } from './validation-wlK06puw.js';
2
+ export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-wlK06puw.js';
3
3
  export { getDocumentPageCount, getPDFPageCount, getPageCountMetadata, getTotalPageCount, splitPDFIntoChunks } from './pdf-utils.js';
4
4
 
5
5
  /**
@@ -559,6 +559,8 @@ type NormalizedFeatures = {
559
559
  schemaValidation: boolean;
560
560
  /** Handwritten text recognition support */
561
561
  handwrittenText: boolean;
562
+ /** Separate header/footer extraction from main content */
563
+ headerFooterExtraction: boolean;
562
564
  /** Supported output formats */
563
565
  outputFormats: OutputFormatSupport;
564
566
  };
package/dist/index.js CHANGED
@@ -1558,6 +1558,8 @@ function defaultNormalizer(id, data, source) {
1558
1558
  return normalizeReductoProvider(id, d);
1559
1559
  } else if (source === "unsiloed") {
1560
1560
  return normalizeUnsiloedProvider(id, d);
1561
+ } else if (source === "mistral") {
1562
+ return normalizeMistralProvider(id, d);
1561
1563
  }
1562
1564
  const defaultOutputFormats = { text: true, markdown: false, html: false, json: false };
1563
1565
  const defaultFeatures = {
@@ -1581,6 +1583,7 @@ function defaultNormalizer(id, data, source) {
1581
1583
  boundingBoxes: false,
1582
1584
  schemaValidation: false,
1583
1585
  handwrittenText: false,
1586
+ headerFooterExtraction: false,
1584
1587
  outputFormats: defaultOutputFormats
1585
1588
  };
1586
1589
  return {
@@ -1670,6 +1673,8 @@ function normalizeLLMProvider(id, d) {
1670
1673
  // Some LLMs support schema validation
1671
1674
  handwrittenText: false,
1672
1675
  // Not specific to LLMs
1676
+ headerFooterExtraction: false,
1677
+ // LLMs don't extract header/footer separately
1673
1678
  outputFormats
1674
1679
  };
1675
1680
  const vendor = d.vendor ?? id;
@@ -1777,6 +1782,8 @@ function normalizeDatalabProvider(id, d) {
1777
1782
  // VLM providers support schema validation
1778
1783
  handwrittenText: true,
1779
1784
  // Datalab handles handwritten text
1785
+ headerFooterExtraction: false,
1786
+ // Datalab has issues with header/footer extraction
1780
1787
  outputFormats
1781
1788
  };
1782
1789
  return {
@@ -1886,6 +1893,8 @@ function normalizeReductoProvider(id, d) {
1886
1893
  // Extract has schema validation
1887
1894
  handwrittenText: false,
1888
1895
  // Reducto doesn't specifically advertise handwriting
1896
+ headerFooterExtraction: true,
1897
+ // Reducto has Header/Footer block types
1889
1898
  outputFormats
1890
1899
  };
1891
1900
  return {
@@ -1999,6 +2008,8 @@ function normalizeUnsiloedProvider(id, d) {
1999
2008
  // Extract supports schema validation
2000
2009
  handwrittenText: false,
2001
2010
  // Unsiloed doesn't specifically advertise handwriting
2011
+ headerFooterExtraction: false,
2012
+ // Unsiloed doesn't extract header/footer separately
2002
2013
  outputFormats
2003
2014
  };
2004
2015
  return {
@@ -2288,6 +2299,123 @@ function getAllModels() {
2288
2299
  function clearModelRegistry() {
2289
2300
  modelRegistry.clear();
2290
2301
  }
2302
/**
 * Normalize a raw Mistral provider metadata record into the shared provider shape.
 *
 * Every field that the raw record may override is read with `??`, so an explicit
 * `false`/`0` in the metadata wins over the built-in default.
 *
 * @param {string} id - Registry key; used as fallback for `id`, `name` and `model`.
 * @param {object} d - Raw provider metadata. Optional sections read here:
 *   `supportedOptions`, `outputFormat.features`, `capabilities`, `inputRequirements`,
 *   `compatibleNodes`, `inputFormats`, `pricing`, `apiConfig.rateLimit`.
 * @returns {object} Normalized provider descriptor with `source: "mistral"`; the
 *   untouched input is kept under `raw`.
 */
function normalizeMistralProvider(id, d) {
  const opts = d.supportedOptions ?? {};
  // Resolve the type once so the derived defaults below agree with the `type`
  // that is returned (a record without `type` is treated as OCR throughout).
  const type = d.type ?? "OCR";
  const isVLM = type === "VLM";
  const isOCR = type === "OCR";
  const model = d.model ?? id;
  const outputFeatures = d.outputFormat?.features;
  const mimeTypes = d.inputFormats?.mimeTypes ?? [];
  const imageExtraction = opts.includeImageBase64 ?? false;
  const outputFormats = {
    text: true,
    markdown: outputFeatures?.markdown ?? isOCR,
    // OCR 3 can output HTML tables
    html: outputFeatures?.htmlTables ?? isOCR,
    json: outputFeatures?.structuredJSON ?? isVLM
  };
  const features = {
    maxPages: d.inputFormats?.maxPages !== void 0,
    // Mistral supports pages param: "0-5" or [0,2,5]
    pageRange: true,
    // Mistral doesn't support language hints
    languageHints: false,
    // Mistral doesn't have processing modes
    processingModes: false,
    // Mistral doesn't have agentic mode
    agenticMode: false,
    // Mistral OCR 3 doesn't support custom prompts
    customPrompts: false,
    // Can include embedded images
    imageExtraction,
    // Mistral doesn't add page markers
    pageMarkers: false,
    // Mistral doesn't provide citations
    citations: false,
    // Mistral doesn't do chunking
    chunking: false,
    // Mistral doesn't do segmentation
    segmentation: false,
    stripExistingOCR: false,
    formatLines: false,
    // OCR 3 always does OCR
    forceOCR: true,
    // html or markdown table format
    tableOutputFormats: opts.tableFormat ?? isOCR,
    tableMerging: false,
    // Mistral doesn't provide confidence scores
    confidence: false,
    // NO text-level bboxes
    boundingBoxes: outputFeatures?.boundingBoxes ?? false,
    // VLM supports schema
    schemaValidation: outputFeatures?.schemaValidation ?? isVLM,
    // Excellent handwriting support
    handwrittenText: outputFeatures?.handwrittenText ?? true,
    // extract_header/extract_footer: supported when EITHER option is available.
    // (`a ?? b` would stop at an explicit `false` header flag and ignore the footer.)
    headerFooterExtraction: Boolean(opts.extractHeader || opts.extractFooter),
    outputFormats
  };
  return {
    id: d.id ?? id,
    name: d.name ?? id,
    source: "mistral",
    type,
    // 3-layer identity
    identity: {
      provider: "mistral",
      model,
      method: "native"
    },
    capabilities: {
      supportsImages: d.capabilities?.supportsImages ?? true,
      supportsPDFs: d.capabilities?.supportsPDFs ?? true,
      // DOCX/PPTX has known issues
      supportsDocuments: d.capabilities?.supportsDocuments ?? false,
      // OCR 3 doesn't do reasoning
      supportsReasoning: false,
      supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
      // Extended capabilities
      supportsPrompts: false,
      supportsCitations: false,
      supportsChunking: false,
      supportsImageExtraction: imageExtraction,
      supportsPageMarkers: false,
      supportsLanguageHints: false,
      supportsProcessingModes: false,
      supportsSegmentation: false,
      outputFormats
    },
    features,
    // Mistral providers always need raw document input
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "raw-document",
      acceptedMethods: d.inputRequirements?.acceptedMethods ?? ["base64", "url"]
    },
    compatibleNodes: {
      parse: d.compatibleNodes?.parse ?? isOCR,
      extract: d.compatibleNodes?.extract ?? isVLM,
      categorize: d.compatibleNodes?.categorize ?? false,
      qualify: d.compatibleNodes?.qualify ?? false,
      split: d.compatibleNodes?.split ?? false
    },
    inputFormats: {
      // Split the single mimeTypes list into image vs. everything else
      imageMimeTypes: mimeTypes.filter((m) => m.startsWith("image/")),
      documentMimeTypes: mimeTypes.filter((m) => !m.startsWith("image/")),
      inputMethods: d.inputFormats?.inputMethods ?? ["base64", "url"],
      // 50MB limit
      maxFileSize: d.inputFormats?.maxFileSize ?? 50,
      maxPages: d.inputFormats?.maxPages ?? 1e3
    },
    pricing: {
      model: "per-page",
      // $2/1000 pages
      perPage: d.pricing?.perPage ?? 2e-3,
      currency: "USD",
      notes: d.pricing?.notes ?? "$2 per 1000 pages"
    },
    rateLimits: {
      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
    },
    raw: d
  };
}
2291
2419
 
2292
2420
  // src/retry.ts
2293
2421
  var DEFAULT_RETRY_CONFIG = {