@doclo/core 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +34 -0
- package/dist/index.d.ts +931 -0
- package/dist/index.js +2293 -0
- package/dist/index.js.map +1 -0
- package/dist/internal/validation-utils.d.ts +1 -0
- package/dist/internal/validation-utils.js +650 -0
- package/dist/internal/validation-utils.js.map +1 -0
- package/dist/observability/index.d.ts +933 -0
- package/dist/observability/index.js +630 -0
- package/dist/observability/index.js.map +1 -0
- package/dist/pdf-utils.d.ts +123 -0
- package/dist/pdf-utils.js +106 -0
- package/dist/pdf-utils.js.map +1 -0
- package/dist/runtime/base64.d.ts +100 -0
- package/dist/runtime/base64.js +52 -0
- package/dist/runtime/base64.js.map +1 -0
- package/dist/runtime/crypto.d.ts +56 -0
- package/dist/runtime/crypto.js +35 -0
- package/dist/runtime/crypto.js.map +1 -0
- package/dist/runtime/env.d.ts +130 -0
- package/dist/runtime/env.js +76 -0
- package/dist/runtime/env.js.map +1 -0
- package/dist/security/index.d.ts +236 -0
- package/dist/security/index.js +260 -0
- package/dist/security/index.js.map +1 -0
- package/dist/validation-CzOz6fwq.d.ts +1126 -0
- package/dist/validation.d.ts +1 -0
- package/dist/validation.js +445 -0
- package/dist/validation.js.map +1 -0
- package/package.json +70 -0
|
@@ -0,0 +1,1126 @@
|
|
|
1
|
+
/**
 * Provider Identity Types
 *
 * Implements the 3-layer hierarchy for provider identification:
 * 1. Provider (Company/Vendor) - e.g., datalab, openai, anthropic
 * 2. Model - e.g., surya, marker-ocr, claude-sonnet-4.5
 * 3. Method - e.g., native, openrouter, self-hosted
 */
|
|
9
|
+
/**
 * Provider vendors (companies).
 * Each member names the company or organization providing the service.
 */
|
|
13
|
+
type ProviderVendor =
  | 'datalab'
  | 'reducto'
  | 'unsiloed'
  | 'openai'
  | 'anthropic'
  | 'google'
  | 'xai';
|
|
14
|
+
/**
 * Access methods for providers
 * - native: Direct API call to the provider's official endpoint
 * - openrouter: Via the OpenRouter aggregator (LLM only)
 * - self-hosted: Self-hosted instance (e.g., pip install surya-ocr)
 */
|
|
20
|
+
type AccessMethod =
  | 'native'
  | 'openrouter'
  | 'self-hosted';
|
|
21
|
+
/**
 * Complete provider identity: vendor, model and access method combined.
 * All three layers are read-only.
 */
interface ProviderIdentity {
  /** Vendor/company layer (e.g., 'datalab'). */
  readonly provider: ProviderVendor;

  /** Model/version layer (e.g., 'surya', 'marker-vlm'). */
  readonly model: string;

  /** Access-method layer (e.g., 'native', 'self-hosted'). */
  readonly method: AccessMethod;
}
|
|
32
|
+
/**
 * Converts a provider identity to its canonical string format.
 *
 * Format: "provider:model" (e.g., "datalab:surya").
 * NOTE(review): the example output carries no `method` part, so the access
 * method does not appear to be encoded — confirm against the implementation.
 *
 * @param identity - The provider identity to serialize.
 * @returns The canonical "provider:model" string.
 *
 * @example
 * ```typescript
 * toProviderString({ provider: 'datalab', model: 'surya', method: 'native' })
 * // => "datalab:surya"
 * ```
 */
|
|
42
|
+
declare function toProviderString(identity: ProviderIdentity): string;
|
|
43
|
+
/**
 * Parses a canonical provider string back into a partial identity.
 *
 * The access method cannot be determined from the string alone, so the
 * result carries only `provider` and `model` (both plain strings).
 *
 * @param str - Canonical provider string, e.g. "datalab:surya".
 * @returns The `provider` and `model` parts.
 *
 * @example
 * ```typescript
 * parseProviderString("datalab:surya")
 * // => { provider: 'datalab', model: 'surya' }
 * ```
 */
declare function parseProviderString(str: string): { provider: string; model: string };
|
|
57
|
+
/**
 * Checks whether an endpoint appears to be self-hosted.
 * Used to determine the access method for OCR providers.
 *
 * NOTE(review): the matching rules are not visible in this declaration —
 * see the implementation for what counts as "local".
 *
 * @param endpoint - Endpoint to inspect; may be omitted.
 * @returns A boolean indicating whether the endpoint looks self-hosted.
 */
|
|
61
|
+
declare function isLocalEndpoint(endpoint?: string): boolean;
|
|
62
|
+
/**
 * Creates a provider identity whose access method is inferred.
 *
 * @param provider - The vendor/company.
 * @param model - The model name.
 * @param opts - Options, including `endpoint`, used for method inference.
 *   NOTE(review): `via` presumably selects between OpenRouter and native
 *   access explicitly — confirm against the implementation.
 * @returns The assembled provider identity.
 */
declare function createIdentity(
  provider: ProviderVendor,
  model: string,
  opts?: { endpoint?: string; via?: 'openrouter' | 'native' },
): ProviderIdentity;
|
|
73
|
+
|
|
74
|
+
/**
 * Browser-safe validation utilities
 *
 * This module contains all validation code with ZERO Node.js dependencies.
 * It can be safely bundled for browser environments.
 */
|
|
80
|
+
/** Page-centric IR */

/**
 * Bounding box described by four numbers.
 * NOTE(review): presumably `x`/`y` are the origin and `w`/`h` the size;
 * units and origin corner are not stated here — confirm with producers.
 */
type BBox = { x: number; y: number; w: number; h: number };
|
|
87
|
+
/**
 * A single line of text in the page-centric IR.
 * Only `text` is required; every other field is optional.
 */
type IRLine = {
  /** Text content of the line. */
  text: string;

  /** Optional bounding box for the line. */
  bbox?: BBox;

  // NOTE(review): presumably character offsets into some larger text — confirm what they index.
  startChar?: number;
  endChar?: number;

  /** Optional identifier for the line. */
  lineId?: string;
};
|
|
94
|
+
/**
 * One page of the page-centric IR: dimensions plus the lines found on it,
 * with optional markdown/HTML renderings and free-form extras.
 */
type IRPage = {
  pageNumber?: number;
  width: number;
  height: number;

  /** Lines belonging to this page. */
  lines: IRLine[];

  markdown?: string;
  html?: string;

  /** Free-form additional data; values must be narrowed before use. */
  extras?: { [key: string]: unknown };
};
|
|
103
|
+
/**
 * Standard extras fields for DocumentIR.
 * Every named field is optional, and arbitrary additional keys are permitted.
 */
type DocumentIRExtras = {
  /** Total number of pages in the original document (for PDFs, DOCX, etc.). */
  pageCount?: number;

  /** Processing cost for this document, in USD. */
  costUSD?: number;

  /** Raw, provider-specific response. */
  raw?: unknown;

  /** Chunked documents only: index of this chunk (0-indexed). */
  chunkIndex?: number;

  /** Chunked documents only: total number of chunks. */
  totalChunks?: number;

  /** Chunked documents only: [startPage, endPage], 1-indexed and inclusive. */
  pageRange?: [number, number];

  /** Unsiloed only: total semantic chunks (not traditional pages). */
  totalSemanticChunks?: number;

  /** Any further provider- or caller-defined fields. */
  [key: string]: unknown;
};
|
|
122
|
+
/** Document-level IR: an ordered list of pages plus optional document extras. */
type DocumentIR = {
  pages: IRPage[];
  extras?: DocumentIRExtras;
};
|
|
126
|
+
|
|
127
|
+
/** Provider capability contracts */

/**
 * Contract for an OCR provider: given a document by URL or base64, it
 * resolves to a DocumentIR.
 */
type OCRProvider = {
  /** Full 3-layer identity (provider/model/method). */
  identity?: ProviderIdentity;

  /** Canonical name in "provider:model" format. */
  name: string;

  /**
   * Parses the input document into the page-centric IR.
   * Both `url` and `base64` are optional at the type level.
   */
  parseToIR: (input: { url?: string; base64?: string }) => Promise<DocumentIR>;
};
|
|
138
|
+
/**
 * Multimodal input for VLM providers: optional text plus optional image and
 * PDF attachments.
 */
type MultimodalInput = {
  text?: string;

  /** Image attachments; `mimeType` is required on each entry. */
  images?: {
    url?: string;
    base64?: string;
    mimeType: string;
  }[];

  /** PDF attachments, referenced by URL, base64 payload, or file ID. */
  pdfs?: {
    url?: string;
    base64?: string;
    fileId?: string;
  }[];
};
|
|
152
|
+
/** Reasoning configuration, normalized across providers. */
type ReasoningConfig = {
  /** Reasoning effort level: low (20% budget), medium (50%), high (80%). */
  effort?: 'low' | 'medium' | 'high';

  /** Exclude reasoning tokens from the response (used for accuracy only, not visible). */
  exclude?: boolean;

  /** Enable reasoning with the default (medium) effort. */
  enabled?: boolean;
};
|
|
161
|
+
/**
 * Base LLM provider (text-only): produces JSON from a text prompt and a schema.
 */
type LLMProvider = {
  /** Full 3-layer identity (provider/model/method). */
  identity?: ProviderIdentity;

  /** Canonical name in "provider:model" format. */
  name: string;

  /**
   * Requests a JSON completion for `prompt` against `schema`.
   * The resolved value carries the `json` payload plus optional raw text,
   * cost and token-usage figures.
   */
  completeJson: (input: {
    prompt: string;
    schema: object;
    max_tokens?: number;
    reasoning?: ReasoningConfig;
  }) => Promise<{
    json: unknown;
    rawText?: string;
    costUSD?: number;
    inputTokens?: number;
    outputTokens?: number;
    cacheCreationInputTokens?: number;
    cacheReadInputTokens?: number;
  }>;
};
|
|
182
|
+
/**
 * Vision-capable LLM provider: like the text-only contract, but the prompt
 * may also be a MultimodalInput, and capabilities are declared explicitly.
 */
type VLMProvider = {
  /** Full 3-layer identity (provider/model/method). */
  identity?: ProviderIdentity;

  /** Canonical name in "provider:model" format. */
  name: string;

  /**
   * Requests a JSON completion for a text or multimodal `prompt` against `schema`.
   * The resolved value carries the `json` payload plus optional raw text,
   * cost and token-usage figures.
   */
  completeJson: (input: {
    prompt: string | MultimodalInput;
    schema: object;
    max_tokens?: number;
    reasoning?: ReasoningConfig;
  }) => Promise<{
    json: unknown;
    rawText?: string;
    costUSD?: number;
    inputTokens?: number;
    outputTokens?: number;
    cacheCreationInputTokens?: number;
    cacheReadInputTokens?: number;
  }>;

  /** Declared capabilities; image support is always `true` for this contract. */
  capabilities: {
    supportsImages: true;
    supportsPDFs: boolean;
    maxPDFPages?: number;
  };
};
|
|
208
|
+
/**
 * Legacy alias for backward compatibility.
 * Resolves to exactly the same type as VLMProvider.
 */
|
|
209
|
+
type LLMJsonProvider = VLMProvider;
|
|
210
|
+
/**
 * Processing quality/speed tradeoff modes.
 * Providers map their specific modes to these normalized values.
 */
|
|
214
|
+
type ProcessingMode =
  | 'fast'
  | 'balanced'
  | 'high_accuracy';
|
|
215
|
+
/**
 * Page range specification for partial document processing.
 * Allows processing only a subset of pages for cost savings.
 */
type PageRangeOptions = {
  /** Process only the first N pages. */
  maxPages?: number;

  /** Specific page range (0-indexed), e.g., "0,2-4,10". */
  pageRange?: string;
};
|
|
225
|
+
/** Language hints for OCR processing. */
type LanguageOptions = {
  /** ISO language codes for OCR, e.g., ['en', 'de', 'fr']. */
  langs?: Array<string>;
};
|
|
232
|
+
/**
 * Document segmentation result for splitting "stapled" PDFs.
 * Reports the page boundaries of each detected document type.
 */
type SegmentationResult = {
  /** One entry per detected segment. */
  segments: {
    /** Document type name (e.g., 'invoice', 'contract'). */
    name: string;
    /** Page indices (0-indexed) belonging to this segment. */
    pages: number[];
    /** Confidence level of the segmentation. */
    confidence: 'high' | 'medium' | 'low';
  }[];

  /** Information about the segmentation run as a whole. */
  metadata: {
    /** Total pages in the original document. */
    totalPages: number;
    /** How segmentation was performed. */
    segmentationMethod: 'auto' | 'schema' | 'manual';
  };
};
|
|
252
|
+
/**
 * Image extracted from a document.
 * Represents figures, charts, or embedded images.
 */
type ExtractedImage = {
  /** Block ID or reference (provider-specific). */
  id: string;

  /** Page number where the image appears (0-indexed). */
  pageNumber: number;

  /** Base64-encoded image data. */
  base64: string;

  /** MIME type of the image. */
  mimeType: string;

  /** Location on the page (normalized 0-1 coordinates). */
  bbox?: NormalizedBBox;

  /** Caption text, if one was detected. */
  caption?: string;
};
|
|
270
|
+
/**
 * Extended OCR provider options (beyond basic parseToIR).
 * Normalized across OCR providers; combines the page-range and language
 * options with the OCR-specific flags below.
 */
type OCRProviderOptions = PageRangeOptions &
  LanguageOptions & {
    /** Processing quality/speed tradeoff. */
    mode?: ProcessingMode;

    /** Force OCR even on text-based PDFs. */
    forceOCR?: boolean;

    /** Extract embedded images from the document. */
    extractImages?: boolean;

    /** Add page delimiters to the output. */
    paginate?: boolean;

    /** Remove and redo existing OCR. */
    stripExistingOCR?: boolean;
  };
|
|
286
|
+
/**
 * Extended VLM provider options for document extraction.
 * Normalized across VLM providers; combines the page-range and language
 * options with the VLM-specific settings below.
 */
type VLMProviderOptions = PageRangeOptions &
  LanguageOptions & {
    /** Processing quality/speed tradeoff. */
    mode?: ProcessingMode;

    /** Force OCR even on text-based PDFs. */
    forceOCR?: boolean;

    /** Additional prompt/instructions for extraction. */
    prompt?: string;

    /** Schema for auto-segmentation of multi-document PDFs. */
    segmentationSchema?: object;
  };
|
|
300
|
+
/**
 * Provider citation from the source document.
 * Maps an extracted field to its source locations.
 */
type ProviderCitation = {
  /** JSON path to the extracted field (e.g., "invoice.total"). */
  fieldPath: string;

  /** Source block IDs as reported by the provider. */
  blockIds: Array<string>;

  /** Confidence score (0-1). */
  confidence?: number;
};
|
|
312
|
+
/**
 * Consensus configuration for any node.
 * Only `runs` is required.
 */
type ConsensusConfig = {
  /** Number of runs. */
  runs: number;

  strategy?: 'majority' | 'unanimous';

  /** Tie-handling choice. */
  onTie?: 'random' | 'fail' | 'retry';

  parallel?: boolean;

  /** See MaybeWithConsensusMetadata, which keys on this flag being `true`. */
  includeMetadata?: boolean;

  /** Voting granularity. */
  level?: 'object' | 'field';

  retryOnFailure?: boolean;
  maxRetries?: number;
};
|
|
323
|
+
/**
 * Result of one individual consensus run.
 *
 * @typeParam T - Type of the value produced by the run.
 */
type ConsensusRunResult<T = any> = {
  runIndex: number;

  /** Value produced by the run; the type permits `null`. */
  value: T | null;

  success: boolean;
  error?: string;

  // NOTE(review): time units are not stated here — confirm with the producer.
  startTime: number;
  endTime: number;
  duration: number;

  attempts?: number;
};
|
|
334
|
+
/** Field-level voting details for one field path. */
type FieldVotingDetails = {
  fieldPath: string;

  /** One entry per distinct voting option. */
  values: {
    /** The actual value for this voting option - can be any JSON-serializable type. */
    value: unknown;
    count: number;
    percentage: number;
    runIndices: number[];
  }[];

  /** The winning value from consensus - can be any JSON-serializable type. */
  winner: unknown;

  isTie: boolean;
  agreementScore: number;
};
|
|
349
|
+
/**
 * Consensus execution metadata.
 *
 * @typeParam T - Type of the selected result and of each run's value.
 */
type ConsensusMetadata<T = unknown> = {
  // Run counts
  totalRuns: number;
  successfulRuns: number;
  failedRuns: number;

  strategy: 'majority' | 'unanimous';

  // Selected outcome
  selectedResult: T;
  selectedRunIndex: number;
  confidence: 'high' | 'medium' | 'low';

  // Agreement figures
  overallAgreement: number;
  fieldAgreement: { [fieldPath: string]: number };
  votingDetails: FieldVotingDetails[];

  /** Per-run results. */
  runs: ConsensusRunResult<T>[];

  executionTime: number;
  wasRetry: boolean;

  // Optional diagnostics
  tieBreakerUsed?: 'random' | 'retry' | 'fail' | null;
  votingLevel?: 'object' | 'field';
  isSyntheticResult?: boolean;
  totalRetries?: number;
  emptyResultsFiltered?: number;
};
|
|
370
|
+
/**
 * Output wrapped together with its consensus metadata.
 *
 * @typeParam T - Type of the wrapped data.
 */
type OutputWithConsensus<T = unknown> = {
  data: T;
  consensus: ConsensusMetadata<T>;
};
|
|
375
|
+
/**
 * Conditional type helper for consensus metadata.
 *
 * Resolves to `OutputWithConsensus<T>` when `Config` has
 * `includeMetadata: true`, and to plain `T` otherwise.
 */
type MaybeWithConsensusMetadata<T, Config> =
  Config extends { includeMetadata: true }
    ? OutputWithConsensus<T>
    : T;
|
|
379
|
+
/** Flow input/output types */

/**
 * Input handed to a flow. Every field is optional at the type level.
 */
type FlowInput = {
  url?: string;
  base64?: string;
  pages?: Array<number>;
  bounds?: BBox;
};
|
|
386
|
+
/**
 * All MIME types supported by at least one provider.
 * This is the union of all provider capabilities.
 */
|
|
390
|
+
type SupportedMimeType =
  // PDF
  | 'application/pdf'
  // Images
  | 'image/jpeg'
  | 'image/png'
  | 'image/gif'
  | 'image/webp'
  | 'image/tiff'
  | 'image/bmp'
  | 'image/heic'
  | 'image/heif'
  | 'image/vnd.adobe.photoshop'
  // Microsoft Office
  | 'application/msword'
  | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  | 'application/vnd.ms-excel'
  | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  | 'application/vnd.ms-powerpoint'
  | 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
  // OpenDocument
  | 'application/vnd.oasis.opendocument.text'
  | 'application/vnd.oasis.opendocument.spreadsheet'
  | 'application/vnd.oasis.opendocument.presentation'
  // Text and other formats
  | 'text/plain'
  | 'text/csv'
  | 'text/html'
  | 'application/rtf'
  | 'application/epub+zip';
|
|
391
|
+
/**
 * Flow-level input validation configuration.
 *
 * Lets a flow declare its accepted MIME types so input can be validated
 * early, before flow execution begins.
 */
type FlowInputValidation = {
  /**
   * List of accepted MIME types.
   * When specified, the input must match one of these types or validation fails.
   * When empty/undefined, all supported types are accepted.
   */
  acceptedFormats?: Array<SupportedMimeType>;

  /**
   * Whether to throw on validation failure.
   * @default true
   */
  throwOnInvalid?: boolean;
};
|
|
410
|
+
/**
 * Result of a flow: the output together with per-step metrics, their
 * aggregate, the artifacts, and an optional error.
 *
 * @typeParam T - Type of the flow output.
 */
type FlowResult<T = any> = {
  output: T;

  /** Per-step metrics. */
  metrics: StepMetric[];

  /** Aggregated metrics. */
  aggregated: AggregatedMetrics;

  artifacts: Record<string, any>;

  /** Optional error attached to the result. */
  error?: Error;
};
|
|
417
|
+
/**
 * One document produced by splitting: its type name, schema, pages,
 * optional bounds, and a flow input.
 */
type SplitDocument = {
  type: string;
  schema: object;
  pages: Array<number>;
  bounds?: BBox;
  input: FlowInput;
};
|
|
424
|
+
/** Citation and source tracking types */
|
|
425
|
+
/** Citation source type indicating data provenance */
|
|
426
|
+
type CitationSourceType =
  | 'ocr'
  | 'vlm'
  | 'llm'
  | 'inferred';
|
|
427
|
+
/**
 * Normalized bounding box: coordinates are in the 0-1 range, relative to
 * the page dimensions.
 */
type NormalizedBBox = { x: number; y: number; w: number; h: number };
|
|
434
|
+
/** Line-level citation reference with spatial information. */
type LineCitation = {
  // Location of the cited line
  pageNumber: number;
  lineIndex: number;
  bbox?: NormalizedBBox;

  /** Text of the cited line. */
  text: string;

  confidence?: number;

  /** Provenance of the cited data. */
  sourceType: CitationSourceType;

  // NOTE(review): presumably character offsets within the line or page text — confirm.
  startChar?: number;
  endChar?: number;
};
|
|
445
|
+
/** Field-level citation mapping an extracted value to its sources. */
type FieldCitation = {
  fieldPath: string;

  /** Extracted value - can be any JSON-serializable type. */
  value: unknown;

  /** Line-level sources backing the value. */
  citations: Array<LineCitation>;

  reasoning?: string;
  confidence?: number;
};
|
|
454
|
+
/**
 * Citation configuration for nodes.
 * `enabled` is required; every other flag is optional.
 */
type CitationConfig = {
  enabled: boolean;

  // Optional toggles for what each citation includes
  includeTextSnippets?: boolean;
  includeBoundingBoxes?: boolean;
  includeConfidence?: boolean;

  minConfidence?: number;
  detectInferred?: boolean;
};
|
|
463
|
+
/**
 * Extended output carrying citations alongside the data.
 *
 * @typeParam T - Type of the wrapped data.
 */
type OutputWithCitations<T> = {
  data: T;

  /** Field-level citations for `data`. */
  citations: Array<FieldCitation>;

  metadata: {
    totalPages?: number;
    sourceType: CitationSourceType;
    hasInferredValues?: boolean;
    processingTime?: number;
  };
};
|
|
474
|
+
/** Node configuration types */

/**
 * Configuration for a parse node. `provider` is the only required field
 * and may be either an OCR or a VLM provider.
 */
type ParseNodeConfig = {
  provider: OCRProvider | VLMProvider;
  consensus?: ConsensusConfig;

  /** Chunked-processing settings. */
  chunked?: {
    maxPagesPerChunk: number;
    overlap?: number;
    parallel?: boolean;
  };

  format?: 'text' | 'markdown' | 'html';
  describeFigures?: boolean;
  includeImages?: boolean;
  additionalPrompt?: string;
  citations?: CitationConfig;
  promptRef?: string;

  /**
   * Optional custom variables for prompt rendering (e.g., language, strictMode, tenantId).
   *
   * Auto-injected variables (no need to pass manually):
   * - format: From config.format
   * - schema: Constructed schema (if applicable)
   * - describeFigures: From config.describeFigures
   * - citationsEnabled: From config.citations?.enabled
   *
   * Use promptVariables only for runtime context (localization, multi-tenancy, behavioral flags).
   */
  promptVariables?: Record<string, any>;

  /**
   * Additional instructions to append to the default prompt.
   * This provides a simple way to customize the prompt without creating a custom prompt asset.
   * The instructions will be added after the main prompt content.
   *
   * @example
   * ```typescript
   * parse({
   *   provider: vlmProvider,
   *   format: 'markdown',
   *   additionalInstructions: "Pay special attention to preserving table structures and footnotes."
   * })
   * ```
   */
  additionalInstructions?: string;

  /**
   * When using promptRef, automatically inject format instruction if {{format}} placeholder is not present.
   * This ensures the UI format selection always takes effect.
   * Default: true
   *
   * @example
   * ```typescript
   * parse({
   *   provider: vlmProvider,
   *   format: 'markdown',
   *   promptRef: 'my-custom-prompt',
   *   autoInjectFormat: false  // Disable auto-injection
   * })
   * ```
   */
  autoInjectFormat?: boolean;

  /**
   * Enable extended reasoning/thinking for VLM providers that support it.
   * Only applies when using a VLM provider (not OCR).
   *
   * Typed with the shared ReasoningConfig alias (effort / exclude / enabled)
   * rather than a duplicated inline shape, so it stays in step with the
   * provider contracts that accept the same setting.
   *
   * @example
   * ```typescript
   * parse({
   *   provider: vlmProvider,
   *   format: 'markdown',
   *   reasoning: { enabled: true, effort: 'medium' }
   * })
   * ```
   */
  reasoning?: ReasoningConfig;
};
|
|
551
|
+
/**
 * Configuration for a split node. Requires a VLM provider and a map of
 * schemas keyed by name (the example uses document types).
 */
type SplitNodeConfig = {
  provider: VLMProvider;

  /** Schemas keyed by name, e.g. `{ invoice: invoiceSchema, receipt: receiptSchema }`. */
  schemas: Record<string, object>;

  includeOther?: boolean;
  consensus?: ConsensusConfig;
  schemaRef?: string;

  /**
   * Enable extended reasoning/thinking for providers that support it.
   *
   * Typed with the shared ReasoningConfig alias (effort / exclude / enabled)
   * rather than a duplicated inline shape, so it stays in step with the
   * provider contracts that accept the same setting.
   *
   * @example
   * ```typescript
   * split({
   *   provider: vlmProvider,
   *   schemas: { invoice: invoiceSchema, receipt: receiptSchema },
   *   reasoning: { enabled: true, effort: 'high' }
   * })
   * ```
   */
  reasoning?: ReasoningConfig;
};
|
|
575
|
+
/**
 * Configuration for a categorize node. Requires a provider (LLM or VLM)
 * and the list of candidate categories.
 */
type CategorizeNodeConfig = {
  provider: LLMProvider | VLMProvider;

  /** Candidate category names, e.g. `['invoice', 'receipt', 'contract']`. */
  categories: string[];

  consensus?: ConsensusConfig;
  additionalPrompt?: string;
  promptRef?: string;

  /**
   * Optional custom variables for prompt rendering (e.g., language, strictMode, tenantId).
   *
   * Auto-injected variables (no need to pass manually):
   * - categories: From config.categories
   * - documentText: Computed from DocumentIR input
   *
   * Use promptVariables only for runtime context (localization, multi-tenancy, behavioral flags).
   */
  promptVariables?: Record<string, any>;

  /**
   * Additional instructions to append to the default prompt.
   * This provides a simple way to customize the prompt without creating a custom prompt asset.
   * The instructions will be added after the main prompt content.
   *
   * @example
   * ```typescript
   * categorize({
   *   provider: llmProvider,
   *   categories: ['invoice', 'receipt', 'contract'],
   *   additionalInstructions: "Consider the document's header and footer when categorizing."
   * })
   * ```
   */
  additionalInstructions?: string;

  /**
   * Enable extended reasoning/thinking for providers that support it.
   *
   * Typed with the shared ReasoningConfig alias (effort / exclude / enabled)
   * rather than a duplicated inline shape, so it stays in step with the
   * provider contracts that accept the same setting.
   *
   * @example
   * ```typescript
   * categorize({
   *   provider: vlmProvider,
   *   categories: ['invoice', 'receipt', 'contract'],
   *   reasoning: { enabled: true, effort: 'low' }
   * })
   * ```
   */
  reasoning?: ReasoningConfig;
};
|
|
624
|
+
/**
 * Configuration for an extract node. Requires a provider (LLM or VLM) and
 * a schema.
 *
 * @typeParam T - Output type used by an EnhancedExtractionSchema's examples.
 */
type ExtractNodeConfig<T = any> = {
  provider: LLMProvider | VLMProvider;

  /**
   * Extraction schema: a plain schema object, an enhanced schema with
   * examples and guidance, or a `{ ref }` reference.
   */
  schema: object | EnhancedExtractionSchema<T> | { ref: string };

  consensus?: ConsensusConfig;

  /**
   * Reasoning settings for this node.
   *
   * Typed with the shared ReasoningConfig alias (effort / exclude / enabled)
   * rather than a duplicated inline shape, so it stays in step with the
   * provider contracts that accept the same setting.
   */
  reasoning?: ReasoningConfig;

  additionalPrompt?: string;
  citations?: CitationConfig;
  promptRef?: string;

  /**
   * Optional custom variables for prompt rendering (e.g., language, strictMode, tenantId).
   *
   * Auto-injected variables (no need to pass manually):
   * - schema: From config.schema
   * - documentText: Computed from DocumentIR or FlowInput
   * - schemaTitle: From schema.title or default "the provided schema"
   * - schemaDescription: From schema.description or empty string
   * - structuredFormat: Generated formatting instructions (for markdown/html)
   *
   * Use promptVariables only for runtime context (localization, multi-tenancy, behavioral flags).
   */
  promptVariables?: Record<string, any>;

  /**
   * Additional instructions to append to the default prompt.
   * This provides a simple way to customize the prompt without creating a custom prompt asset.
   * The instructions will be added after the main prompt content.
   *
   * @example
   * ```typescript
   * extract({
   *   provider: llmProvider,
   *   schema: mySchema,
   *   additionalInstructions: "Be strict with date formats. Use YYYY-MM-DD format only."
   * })
   * ```
   */
  additionalInstructions?: string;
};
|
|
667
|
+
/** Chunk output structure */

/**
 * A single chunk: its content plus position, provenance and size figures.
 */
type ChunkMetadata = {
  /** Chunk text. */
  content: string;
  id: string;
  index: number;

  // NOTE(review): presumably character offsets into the source text — confirm.
  startChar: number;
  endChar: number;

  pageNumbers: Array<number>;
  section?: string;
  headers?: Array<string>;

  /** Name of the strategy that produced the chunk. */
  strategy: string;

  // Size figures
  tokenCount?: number;
  wordCount: number;
  charCount: number;
};
|
|
682
|
+
/**
 * Output of chunking: the chunks, summary figures, and optional
 * information about the source.
 */
type ChunkOutput = {
  chunks: Array<ChunkMetadata>;
  totalChunks: number;
  averageChunkSize: number;

  sourceMetadata?: { providerType?: string };

  /** The source document, when included. */
  sourceDocument?: DocumentIR;
};
|
|
691
|
+
/**
 * Configuration for a chunk node. `strategy` is required; every other
 * setting is optional.
 * NOTE(review): which options apply to which strategy is not stated in
 * this declaration — confirm against the chunker implementation.
 */
type ChunkNodeConfig = {
  strategy:
    | 'recursive'
    | 'section'
    | 'page'
    | 'fixed';

  maxSize?: number;
  minSize?: number;
  overlap?: number;
  separators?: Array<string>;

  pagesPerChunk?: number;
  combineShortPages?: boolean;
  minPageContent?: number;

  size?: number;
  unit?: 'tokens' | 'characters';
};
|
|
703
|
+
/** Configuration for a combine node: selects one combination strategy. */
type CombineNodeConfig = {
  strategy:
    | 'merge'
    | 'concatenate'
    | 'first'
    | 'last';
};
|
|
706
|
+
/**
 * Configuration for an output node. Every field is optional.
 */
type OutputNodeConfig = {
  /** Source step ID, or a list of them. */
  source?: string | Array<string>;

  /** Transform mode; 'custom' uses `customTransform`. */
  transform?:
    | 'first'
    | 'last'
    | 'merge'
    | 'pick'
    | 'custom';

  fields?: Array<string>;
  name?: string;

  /**
   * Custom transform function for the 'custom' transform mode.
   *
   * @param inputs - The input value(s) from the source step(s)
   * @param artifacts - All artifacts from the flow execution
   * @returns The transformed output value
   */
  customTransform?: (
    inputs: unknown | unknown[],
    artifacts: Record<string, unknown>,
  ) => unknown;
};
|
|
719
|
+
/**
 * Enhanced extraction schema: a schema accompanied by examples and guidance.
 *
 * @typeParam T - Type of each example's expected output.
 */
type EnhancedExtractionSchema<T = unknown> = {
  schema: object;

  /** Worked examples pairing an input with its expected output. */
  examples?: {
    description: string;
    input: string;
    output: T;
  }[];

  extractionRules?: string;
  contextPrompt?: string;
  hints?: Array<string>;
};
|
|
731
|
+
/** Node & runner */

/**
 * Metric record for a single step: identification, timing, and optional
 * cost/token figures.
 */
type StepMetric = {
  step: string;
  configStepId?: string;

  // Timing
  startMs: number;
  ms: number;

  provider?: string;
  model?: string;

  // Cost and token usage
  costUSD?: number;
  inputTokens?: number;
  outputTokens?: number;
  cacheCreationInputTokens?: number;
  cacheReadInputTokens?: number;

  attemptNumber?: number;

  metadata?: {
    kind?: 'leaf' | 'wrapper' | 'prep';
    rollup?: boolean;
    overheadMs?: number;
    /** Additional metadata fields */
    [key: string]: string | number | boolean | undefined;
  };
};
|
|
753
|
+
/**
 * Aggregated metrics for multi-step flows: overall totals plus a
 * per-provider breakdown.
 */
interface AggregatedMetrics {
  // Overall totals
  totalDurationMs: number;
  totalCostUSD: number;
  totalInputTokens: number;
  totalOutputTokens: number;
  totalCacheCreationTokens: number;
  totalCacheReadTokens: number;
  stepCount: number;

  /** Breakdown keyed by provider. */
  byProvider: {
    [provider: string]: {
      costUSD: number;
      inputTokens: number;
      outputTokens: number;
      callCount: number;
    };
  };
}
|
|
769
|
+
/**
 * Aggregates the metrics of multiple steps into a single summary.
 *
 * @param metrics - Array of step metrics
 * @returns Aggregated totals and per-provider breakdowns
 */
|
|
774
|
+
declare function aggregateMetrics(metrics: StepMetric[]): AggregatedMetrics;
|
|
775
|
+
/**
 * Execution context passed to conditional functions and trigger nodes.
 * Gives access to the artifacts and metrics of all previous steps.
 */
interface FlowContext {
  /** Outputs from all completed steps, indexed by step ID. */
  artifacts: Record<string, any>;

  /** Performance metrics from all completed steps. */
  metrics: Array<StepMetric>;

  /** Call stack for tracking nested flow execution (for circular dependency detection). */
  callStack?: Array<string>;

  /** Maximum nesting depth for flow triggers (default: 10). */
  maxDepth?: number;
}
|
|
789
|
+
/**
|
|
790
|
+
* W3C Trace Context for distributed tracing.
|
|
791
|
+
* Compatible with observability module's TraceContext; parentSpanId and traceState are optional.
|
|
792
|
+
*/
|
|
793
|
+
interface TraceContextLite {
|
|
794
|
+
traceId: string;
|
|
795
|
+
spanId: string;
|
|
796
|
+
parentSpanId?: string;
|
|
797
|
+
/** NOTE(review): presumably the W3C trace-flags bit field as a number - confirm against the observability module */ traceFlags: number;
|
|
798
|
+
traceState?: string;
|
|
799
|
+
}
|
|
800
|
+
/**
|
|
801
|
+
* Observability context passed to node executions (exposed as NodeCtx.observability and accepted by runPipeline); every field is optional.
|
|
802
|
+
* Uses 'any' for config and traceContext to avoid circular imports and
|
|
803
|
+
* maintain compatibility with the full observability types.
|
|
804
|
+
*/
|
|
805
|
+
type NodeObservabilityContext = {
|
|
806
|
+
/** Observability configuration - full type in observability module */
|
|
807
|
+
config?: any;
|
|
808
|
+
flowId?: string;
|
|
809
|
+
executionId?: string;
|
|
810
|
+
stepId?: string;
|
|
811
|
+
stepIndex?: number;
|
|
812
|
+
/** W3C Trace Context - compatible with TraceContext from observability module (typed as any here; see TraceContextLite for a typed shape) */
|
|
813
|
+
traceContext?: any;
|
|
814
|
+
metadata?: Record<string, unknown>;
|
|
815
|
+
};
|
|
816
|
+
/** Context object passed to NodeDef.run as its second argument. */ type NodeCtx = {
|
|
817
|
+
stepId?: string;
|
|
818
|
+
artifacts: Record<string, unknown>;
|
|
819
|
+
/** Callback taking a key and a value. NOTE(review): presumably records the value into artifacts - confirm in the runner */ emit: (key: string, value: unknown) => void;
|
|
820
|
+
/** Push-only sink for StepMetric records */ metrics: {
|
|
821
|
+
push: (m: StepMetric) => void;
|
|
822
|
+
};
|
|
823
|
+
/** Observability context for hooks (optional) */
|
|
824
|
+
observability?: NodeObservabilityContext;
|
|
825
|
+
};
|
|
826
|
+
/** Node type metadata for runtime validation; attached to a node through the optional NodeDef.__meta property */
|
|
827
|
+
type NodeTypeInfo = {
|
|
828
|
+
/** Input types this node accepts (e.g., ['FlowInput', 'DocumentIR']) */
|
|
829
|
+
inputTypes: string[];
|
|
830
|
+
/**
|
|
831
|
+
* Output type this node produces - can be string or function for config-dependent types.
|
|
832
|
+
* When a function, it receives the node's specific config and returns the output type string.
|
|
833
|
+
* Uses 'any' parameter to allow nodes to use their specific config types.
|
|
834
|
+
*/
|
|
835
|
+
outputType: string | ((config: any) => string);
|
|
836
|
+
/** Provider types this node requires (if any): any combination of 'OCR', 'VLM' and 'LLM' */
|
|
837
|
+
requiresProvider?: ('OCR' | 'VLM' | 'LLM')[];
|
|
838
|
+
/** Whether this node can accept array input */
|
|
839
|
+
acceptsArray?: boolean;
|
|
840
|
+
/**
|
|
841
|
+
* Whether this node always outputs an array (or a function of the node config when that is config-dependent).
|
|
842
|
+
* Uses 'any' parameter to allow nodes to use their specific config types.
|
|
843
|
+
*/
|
|
844
|
+
outputsArray?: boolean | ((config: any) => boolean);
|
|
845
|
+
/** Human-readable description of what this node does */
|
|
846
|
+
description?: string;
|
|
847
|
+
};
|
|
848
|
+
/** Definition of a node: a string key plus a run function that receives an input of type I and a NodeCtx and returns Promise<O>. */ type NodeDef<I, O> = {
|
|
849
|
+
key: string;
|
|
850
|
+
run: (input: I, ctx: NodeCtx) => Promise<O>;
|
|
851
|
+
/** Optional type metadata for validation (see NodeTypeInfo) */
|
|
852
|
+
__meta?: NodeTypeInfo;
|
|
853
|
+
};
|
|
854
|
+
/** Takes a key and a run function and returns a NodeDef<I, O> (per the declared signature; the implementation is not part of this declaration file). */ declare const node: <I, O>(key: string, run: NodeDef<I, O>["run"]) => NodeDef<I, O>;
|
|
855
|
+
/** Runs the given node definitions on input and resolves with an output value, the artifacts map and the step metrics; an observability context is optional. NOTE(review): how steps are chained is not visible in this declaration - confirm in the implementation. */ declare function runPipeline(steps: NodeDef<any, any>[], input: any, observabilityContext?: NodeObservabilityContext): Promise<{
|
|
856
|
+
output: any;
|
|
857
|
+
artifacts: Record<string, unknown>;
|
|
858
|
+
metrics: StepMetric[];
|
|
859
|
+
}>;
|
|
860
|
+
/**
|
|
861
|
+
* Flow execution error with step context.
|
|
862
|
+
*
|
|
863
|
+
* Thrown when a flow step fails during execution. Includes:
|
|
864
|
+
* - Which step failed (ID, index, type)
|
|
865
|
+
* - Which steps completed successfully
|
|
866
|
+
* - Partial artifacts from completed steps (for debugging; optional, may be undefined)
|
|
867
|
+
* - The original error that caused the failure
|
|
868
|
+
*
|
|
869
|
+
* This makes debugging flow failures much easier by showing exactly where the error occurred
|
|
870
|
+
* and what data was produced before the failure.
|
|
871
|
+
*
|
|
872
|
+
* @example
|
|
873
|
+
* ```typescript
|
|
874
|
+
* try {
|
|
875
|
+
* await flow.run(input);
|
|
876
|
+
* } catch (error) {
|
|
877
|
+
* if (error instanceof FlowExecutionError) {
|
|
878
|
+
* console.error(`Failed at step ${error.failedStepIndex}: ${error.failedStepType}`);
|
|
879
|
+
* console.error(`Step ID: ${error.failedStep}`);
|
|
880
|
+
* console.error(`Completed: ${error.completedSteps.join(', ')}`);
|
|
881
|
+
* console.error(`Original error: ${error.originalError.message}`);
|
|
882
|
+
*
|
|
883
|
+
* // Access partial results from completed steps
|
|
884
|
+
* if (error.partialArtifacts?.qualify) {
|
|
885
|
+
* console.log('Quality assessment completed:', error.partialArtifacts.qualify);
|
|
886
|
+
* }
|
|
887
|
+
* }
|
|
888
|
+
* }
|
|
889
|
+
* ```
|
|
890
|
+
*/
|
|
891
|
+
declare class FlowExecutionError extends Error {
|
|
892
|
+
/** The ID of the step that failed (e.g., 'parse_node123') */
|
|
893
|
+
readonly failedStep: string;
|
|
894
|
+
/** The index of the failed step in the flow (0-based) */
|
|
895
|
+
readonly failedStepIndex: number;
|
|
896
|
+
/** The type of the failed step (e.g., 'parse', 'extract', 'step', 'conditional', 'forEach') */
|
|
897
|
+
readonly failedStepType: string;
|
|
898
|
+
/** Array of step IDs that completed successfully before the failure */
|
|
899
|
+
readonly completedSteps: string[];
|
|
900
|
+
/** The original error that caused the failure */
|
|
901
|
+
readonly originalError: Error;
|
|
902
|
+
/** Partial artifacts from steps that completed before the failure (optional; may be undefined) */
|
|
903
|
+
readonly partialArtifacts?: Record<string, any> | undefined;
|
|
904
|
+
constructor(message: string,
|
|
905
|
+
/** The ID of the step that failed (e.g., 'parse_node123') */
|
|
906
|
+
failedStep: string,
|
|
907
|
+
/** The index of the failed step in the flow (0-based) */
|
|
908
|
+
failedStepIndex: number,
|
|
909
|
+
/** The type of the failed step (e.g., 'parse', 'extract', 'step', 'conditional', 'forEach') */
|
|
910
|
+
failedStepType: string,
|
|
911
|
+
/** Array of step IDs that completed successfully before the failure */
|
|
912
|
+
completedSteps: string[],
|
|
913
|
+
/** The original error that caused the failure */
|
|
914
|
+
originalError: Error,
|
|
915
|
+
/** Partial artifacts from steps that completed before the failure (optional; may be undefined) */
|
|
916
|
+
partialArtifacts?: Record<string, any> | undefined);
|
|
917
|
+
}
|
|
918
|
+
/**
|
|
919
|
+
* Flow validation error for invalid node connections.
|
|
920
|
+
*
|
|
921
|
+
* Thrown when building a flow with incompatible node connections.
|
|
922
|
+
* Provides helpful error messages and suggestions for fixing the issue; every field besides message is optional.
|
|
923
|
+
*
|
|
924
|
+
* @example
|
|
925
|
+
* ```typescript
|
|
926
|
+
* try {
|
|
927
|
+
* const flow = createFlow()
|
|
928
|
+
* .step('parse', parse({ provider: ocrProvider }))
|
|
929
|
+
* .step('combine', combine()) // Invalid: combine needs array input
|
|
930
|
+
* .build();
|
|
931
|
+
* } catch (error) {
|
|
932
|
+
* if (error instanceof FlowValidationError) {
|
|
933
|
+
* console.error(error.message);
|
|
934
|
+
* console.error('Reason:', error.reason);
|
|
935
|
+
* console.log('Suggestions:', error.suggestions?.join('\n'));
|
|
936
|
+
* }
|
|
937
|
+
* }
|
|
938
|
+
* ```
|
|
939
|
+
*/
|
|
940
|
+
declare class FlowValidationError extends Error {
|
|
941
|
+
/** Reason for the validation failure, when provided */ readonly reason?: string | undefined;
|
|
942
|
+
/** Suggestions for fixing the issue, when provided */ readonly suggestions?: string[] | undefined;
|
|
943
|
+
/** NOTE(review): presumably identifies the source node of the rejected connection - confirm whether this is a step ID or a node type name */ readonly sourceNode?: string | undefined;
|
|
944
|
+
/** NOTE(review): presumably identifies the target node of the rejected connection - confirm whether this is a step ID or a node type name */ readonly targetNode?: string | undefined;
|
|
945
|
+
/** NOTE(review): presumably the output type of the source node (cf. NodeTypeInfo.outputType) - confirm */ readonly sourceOutputType?: string | undefined;
|
|
946
|
+
/** NOTE(review): presumably the input types accepted by the target node (cf. NodeTypeInfo.inputTypes) - confirm */ readonly targetInputTypes?: string[] | undefined;
|
|
947
|
+
constructor(message: string, reason?: string | undefined, suggestions?: string[] | undefined, sourceNode?: string | undefined, targetNode?: string | undefined, sourceOutputType?: string | undefined, targetInputTypes?: string[] | undefined);
|
|
948
|
+
}
|
|
949
|
+
/** Node type names for validation; these are the row and column keys of NODE_COMPATIBILITY_MATRIX */
|
|
950
|
+
type NodeTypeName = 'parse' | 'split' | 'categorize' | 'extract' | 'chunk' | 'combine' | 'trigger' | 'output';
|
|
951
|
+
/** Compatibility rule for node connections (the value type of each NODE_COMPATIBILITY_MATRIX cell) */
|
|
952
|
+
type CompatibilityRule = {
|
|
953
|
+
/** Whether the connection is allowed */ valid: boolean;
|
|
954
|
+
/** NOTE(review): presumably true when the connection only works through forEach - confirm */ requiresForEach?: boolean;
|
|
955
|
+
/** Indicates this connection cannot be fully validated at build-time and requires runtime type checking */
|
|
956
|
+
requiresRuntimeValidation?: boolean;
|
|
957
|
+
reason?: string;
|
|
958
|
+
note?: string;
|
|
959
|
+
};
|
|
960
|
+
/**
|
|
961
|
+
* Node Compatibility Matrix
|
|
962
|
+
*
|
|
963
|
+
* Defines which nodes can connect to which other nodes (NOTE(review): presumably indexed as [source][target] - confirm).
|
|
964
|
+
* This is the single source of truth for node connection validation.
|
|
965
|
+
*
|
|
966
|
+
* Rules based on input/output type compatibility:
|
|
967
|
+
* - parse: FlowInput → DocumentIR (or DocumentIR[] if chunked)
|
|
968
|
+
* - split: FlowInput → SplitDocument[] (requires forEach)
|
|
969
|
+
* - categorize: DocumentIR|FlowInput → {input, category}
|
|
970
|
+
* - extract: DocumentIR|FlowInput|ChunkOutput → T (typed JSON)
|
|
971
|
+
* - chunk: DocumentIR|DocumentIR[] → ChunkOutput
|
|
972
|
+
* - combine: T[] → T|T[] (merges forEach results)
|
|
973
|
+
* - trigger: any → TOutput (depends on child flow)
|
|
974
|
+
* NOTE(review): 'output' is also a NodeTypeName (and therefore a matrix key) but has no rule listed above - document it.
|
|
975
|
+
* Special behaviors:
|
|
976
|
+
* - forEach auto-unwraps SplitDocument.input → FlowInput
|
|
977
|
+
* - Conditional auto-unwraps {input, category} → input
|
|
978
|
+
* - parse with chunked:true outputs DocumentIR[] instead of DocumentIR
|
|
979
|
+
*/
|
|
980
|
+
declare const NODE_COMPATIBILITY_MATRIX: Record<NodeTypeName, Record<NodeTypeName, CompatibilityRule>>;
|
|
981
|
+
/**
|
|
982
|
+
* Get node type name from a NodeDef
|
|
983
|
+
* @param node - Node definition
|
|
984
|
+
* @returns Node type name (e.g., 'parse', 'extract'), or null (the declared return type is NodeTypeName | null)
|
|
985
|
+
*/
|
|
986
|
+
declare function getNodeTypeName(node: NodeDef<any, any>): NodeTypeName | null;
|
|
987
|
+
/**
|
|
988
|
+
* Get type information from a node (NOTE(review): presumably read from NodeDef.__meta - confirm)
|
|
989
|
+
* @param node - Node definition
|
|
990
|
+
* @returns NodeTypeInfo if available, otherwise null
|
|
991
|
+
*/
|
|
992
|
+
declare function getNodeTypeInfo(node: NodeDef<any, any>): NodeTypeInfo | null;
|
|
993
|
+
/**
|
|
994
|
+
* Get compatible target nodes for a given source node
|
|
995
|
+
* @param sourceType - Source node type name
|
|
996
|
+
* @param includeForEach - Include connections that require forEach (optional; the default is not visible in this declaration)
|
|
997
|
+
* @returns Array of compatible target node types
|
|
998
|
+
*/
|
|
999
|
+
declare function getCompatibleTargets(sourceType: NodeTypeName, includeForEach?: boolean): NodeTypeName[];
|
|
1000
|
+
/**
|
|
1001
|
+
* Get suggested connections when a connection is invalid
|
|
1002
|
+
* @param sourceType - Source node type name
|
|
1003
|
+
* @returns Array of suggestion strings (string[], the same shape as ValidationResult.suggestions)
|
|
1004
|
+
*/
|
|
1005
|
+
declare function getSuggestedConnections(sourceType: NodeTypeName): string[];
|
|
1006
|
+
/**
|
|
1007
|
+
* Validation result for node connections; returned by validateNodeConnection and canStartForEachItemFlow. Only valid is required.
|
|
1008
|
+
*/
|
|
1009
|
+
type ValidationResult = {
|
|
1010
|
+
valid: boolean;
|
|
1011
|
+
reason?: string;
|
|
1012
|
+
suggestions?: string[];
|
|
1013
|
+
requiresForEach?: boolean;
|
|
1014
|
+
/** Warning message for connections that are valid but require runtime type checking */
|
|
1015
|
+
warning?: string;
|
|
1016
|
+
};
|
|
1017
|
+
/**
|
|
1018
|
+
* Validate if two node types can be connected
|
|
1019
|
+
* @param sourceType - Source node type name
|
|
1020
|
+
* @param targetType - Target node type name
|
|
1021
|
+
* @param forEachEnabled - Whether forEach is enabled on the source node (optional; the default is not visible in this declaration)
|
|
1022
|
+
* @returns Validation result with reason and suggestions
|
|
1023
|
+
*/
|
|
1024
|
+
declare function validateNodeConnection(sourceType: NodeTypeName, targetType: NodeTypeName, forEachEnabled?: boolean): ValidationResult;
|
|
1025
|
+
/**
|
|
1026
|
+
* Get valid starting nodes for forEach itemFlow based on parent node type
|
|
1027
|
+
*
|
|
1028
|
+
* When a node outputs an array and uses forEach, the itemFlow receives individual
|
|
1029
|
+
* array items. This function returns which node types can accept those items.
|
|
1030
|
+
*
|
|
1031
|
+
* @param parentType - The node type that outputs the array (e.g., 'split', 'parse')
|
|
1032
|
+
* @returns Array of node types that can start the forEach itemFlow
|
|
1033
|
+
* See canStartForEachItemFlow to validate one specific starter type.
|
|
1034
|
+
* @example
|
|
1035
|
+
* ```typescript
|
|
1036
|
+
* // split outputs SplitDocument[], itemFlow gets SplitDocument
|
|
1037
|
+
* getValidForEachStarters('split') // ['parse', 'extract', 'categorize', 'trigger']
|
|
1038
|
+
*
|
|
1039
|
+
* // parse(chunked:true) outputs DocumentIR[], itemFlow gets DocumentIR
|
|
1040
|
+
* getValidForEachStarters('parse') // ['categorize', 'extract', 'chunk']
|
|
1041
|
+
* ```
|
|
1042
|
+
*/
|
|
1043
|
+
declare function getValidForEachStarters(parentType: NodeTypeName): NodeTypeName[];
|
|
1044
|
+
/**
|
|
1045
|
+
* Validate if a node type can start a forEach itemFlow for a given parent
|
|
1046
|
+
*
|
|
1047
|
+
* @param parentType - The node type that outputs the array (e.g., 'split')
|
|
1048
|
+
* @param starterType - The node type to validate as itemFlow starter
|
|
1049
|
+
* @returns ValidationResult with detailed error messages and suggestions
|
|
1050
|
+
* See getValidForEachStarters for the list of valid starters of a parent type.
|
|
1051
|
+
* @example
|
|
1052
|
+
* ```typescript
|
|
1053
|
+
* // Valid: split → forEach → parse
|
|
1054
|
+
* canStartForEachItemFlow('split', 'parse') // { valid: true }
|
|
1055
|
+
*
|
|
1056
|
+
* // Invalid: split → forEach → chunk
|
|
1057
|
+
* canStartForEachItemFlow('split', 'chunk')
|
|
1058
|
+
* // {
|
|
1059
|
+
* // valid: false,
|
|
1060
|
+
* // reason: 'chunk cannot start forEach itemFlow after split...',
|
|
1061
|
+
* // suggestions: ['Valid starters: parse, extract, categorize, trigger']
|
|
1062
|
+
* // }
|
|
1063
|
+
* ```
|
|
1064
|
+
*/
|
|
1065
|
+
declare function canStartForEachItemFlow(parentType: NodeTypeName, starterType: NodeTypeName): ValidationResult;
|
|
1066
|
+
/**
|
|
1067
|
+
* JSON Schema node structure for validation.
|
|
1068
|
+
* Represents a node in a JSON Schema definition; nested schemas (properties, items, anyOf, oneOf, allOf, additionalProperties) reuse this same type. Every keyword is optional.
|
|
1069
|
+
*/
|
|
1070
|
+
interface JSONSchemaNode {
|
|
1071
|
+
type?: string | string[];
|
|
1072
|
+
properties?: Record<string, JSONSchemaNode>;
|
|
1073
|
+
items?: JSONSchemaNode | JSONSchemaNode[];
|
|
1074
|
+
required?: string[];
|
|
1075
|
+
enum?: (string | number | boolean | null)[];
|
|
1076
|
+
nullable?: boolean;
|
|
1077
|
+
anyOf?: JSONSchemaNode[];
|
|
1078
|
+
oneOf?: JSONSchemaNode[];
|
|
1079
|
+
allOf?: JSONSchemaNode[];
|
|
1080
|
+
const?: unknown;
|
|
1081
|
+
additionalProperties?: boolean | JSONSchemaNode;
|
|
1082
|
+
minLength?: number;
|
|
1083
|
+
maxLength?: number;
|
|
1084
|
+
minimum?: number;
|
|
1085
|
+
maximum?: number;
|
|
1086
|
+
minItems?: number;
|
|
1087
|
+
maxItems?: number;
|
|
1088
|
+
pattern?: string;
|
|
1089
|
+
format?: string;
|
|
1090
|
+
description?: string;
|
|
1091
|
+
default?: unknown;
|
|
1092
|
+
$ref?: string;
|
|
1093
|
+
}
|
|
1094
|
+
/**
|
|
1095
|
+
* Lightweight JSON Schema validator for Edge Runtime compatibility
|
|
1096
|
+
*
|
|
1097
|
+
* Validates data against a JSON Schema without using AJV's code generation.
|
|
1098
|
+
* This is fully Edge Runtime compatible with zero dependencies.
|
|
1099
|
+
* NOTE(review): which JSONSchemaNode keywords are actually enforced cannot be seen from this declaration - confirm in the implementation.
|
|
1100
|
+
* @param data - The data to validate
|
|
1101
|
+
* @param schema - JSON Schema object (plain object, not AJV JSONSchemaType)
|
|
1102
|
+
* @returns The validated data cast to type T (the same value, typed by the caller-chosen T)
|
|
1103
|
+
* @throws Error if validation fails
|
|
1104
|
+
*/
|
|
1105
|
+
declare function validateJson<T>(data: unknown, schema: JSONSchemaNode): T;
|
|
1106
|
+
/**
|
|
1107
|
+
* Reserved variables that are auto-injected per node type (keys: extract, categorize, parse; each value is a readonly tuple of variable names).
|
|
1108
|
+
* These variables come from config or computed data and cannot be overridden by users.
|
|
1109
|
+
*/
|
|
1110
|
+
declare const RESERVED_VARIABLES: {
|
|
1111
|
+
readonly extract: readonly ["schema", "documentText", "schemaTitle", "schemaDescription", "structuredFormat"];
|
|
1112
|
+
readonly categorize: readonly ["categories", "documentText"];
|
|
1113
|
+
readonly parse: readonly ["format", "schema", "describeFigures", "citationsEnabled"];
|
|
1114
|
+
};
|
|
1115
|
+
/**
|
|
1116
|
+
* Validates that user-provided promptVariables don't attempt to override reserved variables.
|
|
1117
|
+
* Emits console warnings if reserved variables are found in user variables and removes them.
|
|
1118
|
+
*
|
|
1119
|
+
* @param nodeType - The type of node (extract, categorize, parse); these match the keys of RESERVED_VARIABLES
|
|
1120
|
+
* @param userVariables - The user-provided promptVariables object (may be undefined)
|
|
1121
|
+
* @param autoInjectedVariables - The auto-injected variables object
|
|
1122
|
+
* @returns A cleaned variables object with reserved variables protected
|
|
1123
|
+
*/
|
|
1124
|
+
declare function protectReservedVariables(nodeType: 'extract' | 'categorize' | 'parse', userVariables: Record<string, any> | undefined, autoInjectedVariables: Record<string, any>): Record<string, any>;
|
|
1125
|
+
|
|
1126
|
+
export { type ExtractedImage as $, type AccessMethod as A, type BBox as B, type ConsensusConfig as C, type DocumentIR as D, type ExtractNodeConfig as E, type FieldVotingDetails as F, type FlowContext as G, type NodeCtx as H, type IRLine as I, type NodeTypeInfo as J, type NodeDef as K, type LLMProvider as L, type MultimodalInput as M, type NormalizedBBox as N, type OCRProvider as O, type ProviderVendor as P, type NodeTypeName as Q, type ReasoningConfig as R, type SplitDocument as S, type CompatibilityRule as T, type ValidationResult as U, type VLMProvider as V, type JSONSchemaNode as W, type ProcessingMode as X, type PageRangeOptions as Y, type LanguageOptions as Z, type SegmentationResult as _, type IRPage as a, type OCRProviderOptions as a0, type VLMProviderOptions as a1, type ProviderCitation as a2, aggregateMetrics as a3, node as a4, runPipeline as a5, FlowExecutionError as a6, FlowValidationError as a7, NODE_COMPATIBILITY_MATRIX as a8, getNodeTypeName as a9, getNodeTypeInfo as aa, getCompatibleTargets as ab, getSuggestedConnections as ac, validateNodeConnection as ad, getValidForEachStarters as ae, canStartForEachItemFlow as af, validateJson as ag, RESERVED_VARIABLES as ah, protectReservedVariables as ai, type ProviderIdentity as aj, toProviderString as ak, parseProviderString as al, isLocalEndpoint as am, createIdentity as an, type SupportedMimeType as ao, type TraceContextLite as ap, type NodeObservabilityContext as aq, type DocumentIRExtras as b, type LLMJsonProvider as c, type ConsensusRunResult as d, type ConsensusMetadata as e, type OutputWithConsensus as f, type MaybeWithConsensusMetadata as g, type FlowInput as h, type FlowInputValidation as i, type FlowResult as j, type CitationSourceType as k, type LineCitation as l, type FieldCitation as m, type CitationConfig as n, type OutputWithCitations as o, type ParseNodeConfig as p, type SplitNodeConfig as q, type CategorizeNodeConfig as r, type ChunkMetadata as s, type ChunkOutput as t, type ChunkNodeConfig 
as u, type CombineNodeConfig as v, type OutputNodeConfig as w, type EnhancedExtractionSchema as x, type StepMetric as y, type AggregatedMetrics as z };
|