@doclo/core 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +4 -2
- package/dist/index.js +128 -0
- package/dist/index.js.map +1 -1
- package/dist/internal/validation-utils.d.ts +1 -1
- package/dist/pdf-utils.d.ts +1 -1
- package/dist/{validation-D_EcHqPl.d.ts → validation-wlK06puw.d.ts} +1 -1
- package/dist/validation.d.ts +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { P as ProviderVendor, A as AccessMethod } from './validation-
|
|
2
|
-
export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-
|
|
1
|
+
import { P as ProviderVendor, A as AccessMethod } from './validation-wlK06puw.js';
|
|
2
|
+
export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-wlK06puw.js';
|
|
3
3
|
export { getDocumentPageCount, getPDFPageCount, getPageCountMetadata, getTotalPageCount, splitPDFIntoChunks } from './pdf-utils.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
@@ -559,6 +559,8 @@ type NormalizedFeatures = {
|
|
|
559
559
|
schemaValidation: boolean;
|
|
560
560
|
/** Handwritten text recognition support */
|
|
561
561
|
handwrittenText: boolean;
|
|
562
|
+
/** Separate header/footer extraction from main content */
|
|
563
|
+
headerFooterExtraction: boolean;
|
|
562
564
|
/** Supported output formats */
|
|
563
565
|
outputFormats: OutputFormatSupport;
|
|
564
566
|
};
|
package/dist/index.js
CHANGED
|
@@ -1558,6 +1558,8 @@ function defaultNormalizer(id, data, source) {
|
|
|
1558
1558
|
return normalizeReductoProvider(id, d);
|
|
1559
1559
|
} else if (source === "unsiloed") {
|
|
1560
1560
|
return normalizeUnsiloedProvider(id, d);
|
|
1561
|
+
} else if (source === "mistral") {
|
|
1562
|
+
return normalizeMistralProvider(id, d);
|
|
1561
1563
|
}
|
|
1562
1564
|
const defaultOutputFormats = { text: true, markdown: false, html: false, json: false };
|
|
1563
1565
|
const defaultFeatures = {
|
|
@@ -1581,6 +1583,7 @@ function defaultNormalizer(id, data, source) {
|
|
|
1581
1583
|
boundingBoxes: false,
|
|
1582
1584
|
schemaValidation: false,
|
|
1583
1585
|
handwrittenText: false,
|
|
1586
|
+
headerFooterExtraction: false,
|
|
1584
1587
|
outputFormats: defaultOutputFormats
|
|
1585
1588
|
};
|
|
1586
1589
|
return {
|
|
@@ -1670,6 +1673,8 @@ function normalizeLLMProvider(id, d) {
|
|
|
1670
1673
|
// Some LLMs support schema validation
|
|
1671
1674
|
handwrittenText: false,
|
|
1672
1675
|
// Not specific to LLMs
|
|
1676
|
+
headerFooterExtraction: false,
|
|
1677
|
+
// LLMs don't extract header/footer separately
|
|
1673
1678
|
outputFormats
|
|
1674
1679
|
};
|
|
1675
1680
|
const vendor = d.vendor ?? id;
|
|
@@ -1777,6 +1782,8 @@ function normalizeDatalabProvider(id, d) {
|
|
|
1777
1782
|
// VLM providers support schema validation
|
|
1778
1783
|
handwrittenText: true,
|
|
1779
1784
|
// Datalab handles handwritten text
|
|
1785
|
+
headerFooterExtraction: false,
|
|
1786
|
+
// Datalab has issues with header/footer extraction
|
|
1780
1787
|
outputFormats
|
|
1781
1788
|
};
|
|
1782
1789
|
return {
|
|
@@ -1886,6 +1893,8 @@ function normalizeReductoProvider(id, d) {
|
|
|
1886
1893
|
// Extract has schema validation
|
|
1887
1894
|
handwrittenText: false,
|
|
1888
1895
|
// Reducto doesn't specifically advertise handwriting
|
|
1896
|
+
headerFooterExtraction: true,
|
|
1897
|
+
// Reducto has Header/Footer block types
|
|
1889
1898
|
outputFormats
|
|
1890
1899
|
};
|
|
1891
1900
|
return {
|
|
@@ -1999,6 +2008,8 @@ function normalizeUnsiloedProvider(id, d) {
|
|
|
1999
2008
|
// Extract supports schema validation
|
|
2000
2009
|
handwrittenText: false,
|
|
2001
2010
|
// Unsiloed doesn't specifically advertise handwriting
|
|
2011
|
+
headerFooterExtraction: false,
|
|
2012
|
+
// Unsiloed doesn't extract header/footer separately
|
|
2002
2013
|
outputFormats
|
|
2003
2014
|
};
|
|
2004
2015
|
return {
|
|
@@ -2288,6 +2299,123 @@ function getAllModels() {
|
|
|
2288
2299
|
function clearModelRegistry() {
|
|
2289
2300
|
modelRegistry.clear();
|
|
2290
2301
|
}
|
|
2302
|
+
/**
 * Normalize a raw Mistral provider metadata record into the common
 * provider shape (identity, capabilities, features, formats, pricing).
 *
 * Every field read from `d` is optional; missing values fall back to the
 * defaults written inline below.
 *
 * @param {string} id - Registry key; used as fallback for `id`, `name` and `model`.
 * @param {object} d - Raw provider metadata. Fields read here: `type`, `model`,
 *   `id`, `name`, `supportedOptions`, `outputFormat.features`, `capabilities`,
 *   `inputRequirements`, `compatibleNodes`, `inputFormats`, `pricing`,
 *   `apiConfig.rateLimit`.
 * @returns {object} Normalized provider record; the untouched input is kept under `raw`.
 */
function normalizeMistralProvider(id, d) {
  const opts = d.supportedOptions ?? {};
  // Resolve the type once so the OCR/VLM-dependent defaults below always
  // agree with the `type` that is actually returned (a missing type means "OCR").
  const type = d.type ?? "OCR";
  const isVLM = type === "VLM";
  const isOCR = type === "OCR";
  const model = d.model ?? id;
  const formatFeatures = d.outputFormat?.features;
  const mimeTypes = d.inputFormats?.mimeTypes ?? [];
  const isImageMime = (m) => m.startsWith("image/");

  const outputFormats = {
    text: true,
    markdown: formatFeatures?.markdown ?? isOCR,
    // OCR 3 can output HTML tables
    html: formatFeatures?.htmlTables ?? isOCR,
    json: formatFeatures?.structuredJSON ?? isVLM
  };

  const features = {
    // Nullish check (not just `undefined`) so this agrees with the `??`
    // default applied to inputFormats.maxPages further down.
    maxPages: d.inputFormats?.maxPages != null,
    // Mistral supports pages param: "0-5" or [0,2,5]
    pageRange: true,
    // Mistral doesn't support language hints
    languageHints: false,
    // Mistral doesn't have processing modes
    processingModes: false,
    // Mistral doesn't have agentic mode
    agenticMode: false,
    // Mistral OCR 3 doesn't support custom prompts
    customPrompts: false,
    // Can include embedded images
    imageExtraction: opts.includeImageBase64 ?? false,
    // Mistral doesn't add page markers
    pageMarkers: false,
    // Mistral doesn't provide citations
    citations: false,
    // Mistral doesn't do chunking
    chunking: false,
    // Mistral doesn't do segmentation
    segmentation: false,
    stripExistingOCR: false,
    formatLines: false,
    // OCR 3 always does OCR
    forceOCR: true,
    // html or markdown table format
    tableOutputFormats: opts.tableFormat ?? isOCR,
    tableMerging: false,
    // Mistral doesn't provide confidence scores
    confidence: false,
    // NO text-level bboxes
    boundingBoxes: formatFeatures?.boundingBoxes ?? false,
    // VLM supports schema
    schemaValidation: formatFeatures?.schemaValidation ?? isVLM,
    // Excellent handwriting support
    handwrittenText: formatFeatures?.handwrittenText ?? true,
    // extract_header/extract_footer: either option being enabled counts.
    // `||` is deliberate here: a `??` chain would stop at an explicit
    // `extractHeader: false` and never look at `extractFooter`.
    headerFooterExtraction: Boolean(opts.extractHeader || opts.extractFooter),
    outputFormats
  };

  return {
    id: d.id ?? id,
    name: d.name ?? id,
    source: "mistral",
    type,
    // 3-layer identity
    identity: {
      provider: "mistral",
      model,
      method: "native"
    },
    capabilities: {
      supportsImages: d.capabilities?.supportsImages ?? true,
      supportsPDFs: d.capabilities?.supportsPDFs ?? true,
      // DOCX/PPTX has known issues
      supportsDocuments: d.capabilities?.supportsDocuments ?? false,
      // OCR 3 doesn't do reasoning
      supportsReasoning: false,
      supportsStructuredOutput: d.capabilities?.supportsStructuredOutput ?? isVLM,
      // Extended capabilities
      supportsPrompts: false,
      supportsCitations: false,
      supportsChunking: false,
      supportsImageExtraction: opts.includeImageBase64 ?? false,
      supportsPageMarkers: false,
      supportsLanguageHints: false,
      supportsProcessingModes: false,
      supportsSegmentation: false,
      outputFormats
    },
    features,
    // Mistral providers always need raw document input
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "raw-document",
      acceptedMethods: d.inputRequirements?.acceptedMethods ?? ["base64", "url"]
    },
    compatibleNodes: {
      parse: d.compatibleNodes?.parse ?? isOCR,
      extract: d.compatibleNodes?.extract ?? isVLM,
      categorize: d.compatibleNodes?.categorize ?? false,
      qualify: d.compatibleNodes?.qualify ?? false,
      split: d.compatibleNodes?.split ?? false
    },
    inputFormats: {
      // Split the declared mime types into image vs. everything else.
      imageMimeTypes: mimeTypes.filter((m) => isImageMime(m)),
      documentMimeTypes: mimeTypes.filter((m) => !isImageMime(m)),
      inputMethods: d.inputFormats?.inputMethods ?? ["base64", "url"],
      // 50MB limit
      maxFileSize: d.inputFormats?.maxFileSize ?? 50,
      maxPages: d.inputFormats?.maxPages ?? 1e3
    },
    pricing: {
      model: "per-page",
      // $2/1000 pages
      perPage: d.pricing?.perPage ?? 2e-3,
      currency: "USD",
      notes: d.pricing?.notes ?? "$2 per 1000 pages"
    },
    rateLimits: {
      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
    },
    raw: d
  };
}
|
|
2291
2419
|
|
|
2292
2420
|
// src/retry.ts
|
|
2293
2421
|
var DEFAULT_RETRY_CONFIG = {
|