@doclo/core 0.1.12 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +130 -26
- package/dist/index.js +228 -34
- package/dist/index.js.map +1 -1
- package/dist/internal/validation-utils.d.ts +1 -1
- package/dist/internal/validation-utils.js.map +1 -1
- package/dist/pdf-utils.d.ts +1 -1
- package/dist/{validation-wlK06puw.d.ts → validation-B8GRTtww.d.ts} +43 -2
- package/dist/validation.d.ts +1 -1
- package/dist/validation.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { P as ProviderVendor, A as AccessMethod } from './validation-wlK06puw.js';
|
|
2
|
-
export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-wlK06puw.js';
|
|
1
|
+
import { P as ProviderVendor, A as AccessMethod } from './validation-B8GRTtww.js';
|
|
2
|
+
export { G as AggregatedMetrics, B as BBox, r as CategorizeNodeConfig, t as ChunkMetadata, v as ChunkNodeConfig, u as ChunkOutput, n as CitationConfig, k as CitationSourceType, w as CombineNodeConfig, U as CompatibilityRule, C as ConsensusConfig, e as ConsensusMetadata, d as ConsensusRunResult, D as DocumentIR, b as DocumentIRExtras, y as EnhancedExtractionSchema, s as ExtractInputMode, E as ExtractNodeConfig, a0 as ExtractedImage, m as FieldCitation, F as FieldVotingDetails, H as FlowContext, a8 as FlowExecutionError, h as FlowInput, i as FlowInputValidation, j as FlowResult, a4 as FlowStepLocation, a9 as FlowValidationError, I as IRLine, a as IRPage, X as JSONSchemaNode, c as LLMJsonProvider, L as LLMProvider, _ as LanguageOptions, l as LineCitation, g as MaybeWithConsensusMetadata, M as MultimodalInput, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, N as NormalizedBBox, O as OCRProvider, a1 as OCRProviderOptions, x as OutputNodeConfig, o as OutputWithCitations, f as OutputWithConsensus, Z as PageRangeOptions, p as ParseNodeConfig, Y as ProcessingMode, a3 as ProviderCitation, am as ProviderIdentity, aj as RESERVED_VARIABLES, R as ReasoningConfig, $ as SegmentationResult, S as SplitDocument, q as SplitNodeConfig, z as StepMetric, V as VLMProvider, a2 as VLMProviderOptions, W as ValidationResult, a5 as aggregateMetrics, ah as canStartForEachItemFlow, aq as createIdentity, al as extractErrorMessage, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, ap as isLocalEndpoint, a6 as node, ao as parseProviderString, ak as protectReservedVariables, a7 as runPipeline, an as toProviderString, ai as validateJson, af as validateNodeConnection } from './validation-B8GRTtww.js';
|
|
3
3
|
export { getDocumentPageCount, getPDFPageCount, getPageCountMetadata, getTotalPageCount, splitPDFIntoChunks } from './pdf-utils.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
@@ -511,6 +511,22 @@ type OutputFormatSupport = {
|
|
|
511
511
|
html: boolean;
|
|
512
512
|
json: boolean;
|
|
513
513
|
};
|
|
514
|
+
/**
|
|
515
|
+
* Feature status values for normalized features.
|
|
516
|
+
* - `true`: Natively supported by the API
|
|
517
|
+
* - `false`: Not supported
|
|
518
|
+
* - `'deprecated'`: API deprecated this feature, may not work
|
|
519
|
+
* - `'derived'`: SDK provides via transformation (e.g., maxPages from pageRange)
|
|
520
|
+
*/
|
|
521
|
+
type FeatureStatus = true | false | 'deprecated' | 'derived';
|
|
522
|
+
/**
|
|
523
|
+
* Helper to check if a feature is enabled (true, deprecated, or derived)
|
|
524
|
+
*/
|
|
525
|
+
declare function isFeatureEnabled(status: FeatureStatus): boolean;
|
|
526
|
+
/**
|
|
527
|
+
* Page indexing convention used by provider
|
|
528
|
+
*/
|
|
529
|
+
type PageIndexing = '0-indexed' | '1-indexed';
|
|
514
530
|
/**
|
|
515
531
|
* Normalized features across all providers.
|
|
516
532
|
* Maps provider-specific option names to unified names.
|
|
@@ -520,47 +536,77 @@ type OutputFormatSupport = {
|
|
|
520
536
|
*/
|
|
521
537
|
type NormalizedFeatures = {
|
|
522
538
|
/** Limit to first N pages */
|
|
523
|
-
maxPages:
|
|
539
|
+
maxPages: FeatureStatus;
|
|
524
540
|
/** Specific page range selection */
|
|
525
|
-
pageRange:
|
|
541
|
+
pageRange: FeatureStatus;
|
|
526
542
|
/** OCR language hints (maps from 'langs') */
|
|
527
|
-
languageHints:
|
|
543
|
+
languageHints: FeatureStatus;
|
|
528
544
|
/** Quality/speed modes (fast/balanced/high_accuracy) */
|
|
529
|
-
processingModes:
|
|
545
|
+
processingModes: FeatureStatus;
|
|
530
546
|
/** Reducto agentic mode (higher accuracy, more cost) */
|
|
531
|
-
agenticMode:
|
|
547
|
+
agenticMode: FeatureStatus;
|
|
532
548
|
/** Custom prompts (maps from blockCorrectionPrompt, additionalPrompt, systemPrompt) */
|
|
533
|
-
customPrompts:
|
|
549
|
+
customPrompts: FeatureStatus;
|
|
534
550
|
/** Extract embedded images (maps from extractImages, returnImages) */
|
|
535
|
-
imageExtraction:
|
|
551
|
+
imageExtraction: FeatureStatus;
|
|
536
552
|
/** Page delimiters (maps from paginate, addPageMarkers) */
|
|
537
|
-
pageMarkers:
|
|
538
|
-
/** Field-level citations with source references */
|
|
539
|
-
citations:
|
|
553
|
+
pageMarkers: FeatureStatus;
|
|
554
|
+
/** Field-level citations with source references (page/char/block indices) */
|
|
555
|
+
citations: FeatureStatus;
|
|
540
556
|
/** Document chunking modes (RAG-optimized) */
|
|
541
|
-
chunking:
|
|
557
|
+
chunking: FeatureStatus;
|
|
542
558
|
/** Auto-segmentation for multi-document PDFs */
|
|
543
|
-
segmentation:
|
|
559
|
+
segmentation: FeatureStatus;
|
|
544
560
|
/** Re-run OCR on already-OCR'd documents */
|
|
545
|
-
stripExistingOCR:
|
|
561
|
+
stripExistingOCR: FeatureStatus;
|
|
546
562
|
/** Format lines in output */
|
|
547
|
-
formatLines:
|
|
563
|
+
formatLines: FeatureStatus;
|
|
548
564
|
/** Force OCR even if text layer exists */
|
|
549
|
-
forceOCR:
|
|
565
|
+
forceOCR: FeatureStatus;
|
|
550
566
|
/** Table format options (html/json/md/csv) */
|
|
551
|
-
tableOutputFormats:
|
|
567
|
+
tableOutputFormats: FeatureStatus;
|
|
552
568
|
/** Merge consecutive tables */
|
|
553
|
-
tableMerging:
|
|
569
|
+
tableMerging: FeatureStatus;
|
|
554
570
|
/** Block-level confidence scores */
|
|
555
|
-
confidence:
|
|
556
|
-
/** Bounding box coordinates for
|
|
557
|
-
boundingBoxes:
|
|
571
|
+
confidence: FeatureStatus;
|
|
572
|
+
/** Bounding box coordinates for TEXT elements (pixel/normalized coords) */
|
|
573
|
+
boundingBoxes: FeatureStatus;
|
|
574
|
+
/** Bounding box coordinates for IMAGES/FIGURES only (not text) */
|
|
575
|
+
imageBoundingBoxes: FeatureStatus;
|
|
558
576
|
/** JSON schema validation for structured output */
|
|
559
|
-
schemaValidation:
|
|
577
|
+
schemaValidation: FeatureStatus;
|
|
560
578
|
/** Handwritten text recognition support */
|
|
561
|
-
handwrittenText:
|
|
579
|
+
handwrittenText: FeatureStatus;
|
|
562
580
|
/** Separate header/footer extraction from main content */
|
|
563
|
-
headerFooterExtraction:
|
|
581
|
+
headerFooterExtraction: FeatureStatus;
|
|
582
|
+
/** Optimize output for embeddings/RAG */
|
|
583
|
+
embedOptimized: FeatureStatus;
|
|
584
|
+
/** Handle encrypted/password-protected PDFs */
|
|
585
|
+
passwordProtected: FeatureStatus;
|
|
586
|
+
/** Filter block types (headers, footers, page numbers, etc.) */
|
|
587
|
+
contentFiltering: FeatureStatus;
|
|
588
|
+
/** OCR system/mode selection (standard/legacy, auto/full) */
|
|
589
|
+
ocrMode: FeatureStatus;
|
|
590
|
+
/** Async completion webhook callbacks */
|
|
591
|
+
webhookCallback: FeatureStatus;
|
|
592
|
+
/** Vision quality control (low/medium/high) - Gemini */
|
|
593
|
+
mediaResolution: FeatureStatus;
|
|
594
|
+
/** Track changes extraction from Word docs */
|
|
595
|
+
changeTracking: FeatureStatus;
|
|
596
|
+
/** Extract hyperlinks from documents */
|
|
597
|
+
hyperlinkExtraction: FeatureStatus;
|
|
598
|
+
/** Enhanced chart and graph interpretation (Datalab extras=chart_understanding) */
|
|
599
|
+
chartUnderstanding: FeatureStatus;
|
|
600
|
+
/** Control image caption generation (Datalab disable_image_captions) */
|
|
601
|
+
imageCaptions: FeatureStatus;
|
|
602
|
+
/** Extract signatures from documents (Reducto include: ["signatures"]) */
|
|
603
|
+
signatureExtraction: FeatureStatus;
|
|
604
|
+
/** Extract comments/annotations from documents (Reducto include: ["comments"]) */
|
|
605
|
+
commentExtraction: FeatureStatus;
|
|
606
|
+
/** Extract highlighted text from documents (Reducto include: ["highlight"]) */
|
|
607
|
+
highlightExtraction: FeatureStatus;
|
|
608
|
+
/** Summarize figures/charts with VLM (Reducto summarize_figures) */
|
|
609
|
+
figureSummaries: FeatureStatus;
|
|
564
610
|
/** Supported output formats */
|
|
565
611
|
outputFormats: OutputFormatSupport;
|
|
566
612
|
};
|
|
@@ -929,6 +975,64 @@ declare function getAllModels(): ResolvedModelMetadata[];
|
|
|
929
975
|
* Clear model registry (useful for testing)
|
|
930
976
|
*/
|
|
931
977
|
declare function clearModelRegistry(): void;
|
|
978
|
+
/**
|
|
979
|
+
* Get the page indexing convention for a provider.
|
|
980
|
+
*
|
|
981
|
+
* @param provider - Provider metadata or source string
|
|
982
|
+
* @returns Page indexing convention ('0-indexed' or '1-indexed')
|
|
983
|
+
*/
|
|
984
|
+
declare function getPageIndexing(provider: NormalizedProviderMetadata | string): PageIndexing;
|
|
985
|
+
/**
|
|
986
|
+
* Options that can be transformed for derived features.
|
|
987
|
+
*/
|
|
988
|
+
type DerivedFeatureOptions = {
|
|
989
|
+
maxPages?: number;
|
|
990
|
+
pageRange?: string;
|
|
991
|
+
};
|
|
992
|
+
/**
|
|
993
|
+
* Result of derived feature transformation.
|
|
994
|
+
*/
|
|
995
|
+
type TransformedOptions = {
|
|
996
|
+
/** The transformed page_range parameter (provider-specific format) */
|
|
997
|
+
page_range?: string;
|
|
998
|
+
/** Array format for providers that support it (e.g., Mistral) */
|
|
999
|
+
pages?: number[];
|
|
1000
|
+
/** Original options minus the derived ones */
|
|
1001
|
+
remainingOptions: Record<string, unknown>;
|
|
1002
|
+
};
|
|
1003
|
+
/**
|
|
1004
|
+
* Transform maxPages to provider-specific pageRange format.
|
|
1005
|
+
*
|
|
1006
|
+
* This utility handles the conversion when a provider has `maxPages: 'derived'`,
|
|
1007
|
+
* meaning the SDK provides maxPages functionality via the underlying pageRange API.
|
|
1008
|
+
*
|
|
1009
|
+
* @param options - User-provided options including maxPages
|
|
1010
|
+
* @param provider - Provider metadata to determine format
|
|
1011
|
+
* @returns Transformed options with provider-specific pageRange
|
|
1012
|
+
*
|
|
1013
|
+
* @example
|
|
1014
|
+
* ```typescript
|
|
1015
|
+
* // User wants first 5 pages from Reducto (1-indexed)
|
|
1016
|
+
* const result = transformDerivedFeatures({ maxPages: 5 }, reductoProvider);
|
|
1017
|
+
* // => { page_range: '1-5', remainingOptions: {} }
|
|
1018
|
+
*
|
|
1019
|
+
* // User wants first 5 pages from Datalab (0-indexed)
|
|
1020
|
+
* const result = transformDerivedFeatures({ maxPages: 5 }, datalabProvider);
|
|
1021
|
+
* // => { page_range: '0-4', remainingOptions: {} }
|
|
1022
|
+
*
|
|
1023
|
+
* // User wants first 5 pages from Mistral (0-indexed, array format)
|
|
1024
|
+
* const result = transformDerivedFeatures({ maxPages: 5 }, mistralProvider);
|
|
1025
|
+
* // => { page_range: '0-4', pages: [0,1,2,3,4], remainingOptions: {} }
|
|
1026
|
+
* ```
|
|
1027
|
+
*/
|
|
1028
|
+
declare function transformDerivedFeatures(options: DerivedFeatureOptions & Record<string, unknown>, provider: NormalizedProviderMetadata): TransformedOptions;
|
|
1029
|
+
/**
|
|
1030
|
+
* Check if a provider requires derived feature transformation for maxPages.
|
|
1031
|
+
*
|
|
1032
|
+
* @param provider - Provider metadata
|
|
1033
|
+
* @returns true if maxPages needs to be transformed to pageRange
|
|
1034
|
+
*/
|
|
1035
|
+
declare function requiresMaxPagesTransformation(provider: NormalizedProviderMetadata): boolean;
|
|
932
1036
|
|
|
933
1037
|
/**
|
|
934
1038
|
* @doclo/core - Retry Utilities
|
|
@@ -1090,4 +1194,4 @@ declare function getCircuitBreaker(key: string): CircuitBreaker | undefined;
|
|
|
1090
1194
|
*/
|
|
1091
1195
|
declare function withRetry<T>(fn: () => Promise<T>, options?: WithRetryOptions<T>): Promise<T>;
|
|
1092
1196
|
|
|
1093
|
-
export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, resolveDocument, resolveModelMetadata, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };
|
|
1197
|
+
export { type AcceptedMimeType, AccessMethod, type AllAutoVariables, type AutoVariablesForNode, type BaseProviderConfig, type CategorizeAutoVariables, type CircuitBreaker, type CircuitBreakerConfig, type CircuitBreakerState, DEFAULT_CIRCUIT_BREAKER_CONFIG, DEFAULT_RETRY_CONFIG, type DerivedFeatureOptions, type DocumentMimeType, type ExtractAutoVariables, type FeatureName, type FeatureStatus, FlowInputValidationError, type InputRequirements, type ModelMetadata, type ModelQueryFilter, type NormalizedCapabilities, type NormalizedFeatures, type NormalizedProviderMetadata, type OCRProviderConfig, type OutputFormatSupport, type PageIndexing, type ParseAutoVariables, type PromptVariables, type ProviderConfig, type ProviderInputType, type ProviderInstance, type ProviderMetadataWithModels, type ProviderQueryFilter, type ProviderRegistry, type ProviderSecrets, ProviderVendor, type ResolvedModelMetadata, type RetryConfig, type TransformedOptions, type VLMProviderConfig, type WithRetryOptions, bufferToBase64, bufferToDataUri, buildProviderFromConfig, buildProvidersFromConfigs, calculateRetryDelay, clearCircuitBreakers, clearModelRegistry, clearProviderRegistry, createCircuitBreaker, defineMarkerProvider, defineSuryaProvider, defineVLMProvider, detectDocumentType, detectMimeTypeFromBase64, detectMimeTypeFromBase64Async, detectMimeTypeFromBytes, extractBase64, extractStatusCode, getAllModels, getAllProviders, getCheapestProviderFor, getCircuitBreaker, getModelsForNode, getPageIndexing, getProviderById, getProvidersBySource, getProvidersForLargeFiles, getProvidersForMimeType, isFeatureEnabled, isPDFDocument, isRetryableError, parseRetryAfter, queryModels, queryProviders, registerProviderMetadata, registerProviderWithModels, requiresMaxPagesTransformation, resolveDocument, resolveModelMetadata, transformDerivedFeatures, validateFlowInputFormat, validateMimeType, validateMimeTypeAsync, withRetry };
|