@doclo/core 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +34 -0
- package/dist/index.d.ts +931 -0
- package/dist/index.js +2293 -0
- package/dist/index.js.map +1 -0
- package/dist/internal/validation-utils.d.ts +1 -0
- package/dist/internal/validation-utils.js +650 -0
- package/dist/internal/validation-utils.js.map +1 -0
- package/dist/observability/index.d.ts +933 -0
- package/dist/observability/index.js +630 -0
- package/dist/observability/index.js.map +1 -0
- package/dist/pdf-utils.d.ts +123 -0
- package/dist/pdf-utils.js +106 -0
- package/dist/pdf-utils.js.map +1 -0
- package/dist/runtime/base64.d.ts +100 -0
- package/dist/runtime/base64.js +52 -0
- package/dist/runtime/base64.js.map +1 -0
- package/dist/runtime/crypto.d.ts +56 -0
- package/dist/runtime/crypto.js +35 -0
- package/dist/runtime/crypto.js.map +1 -0
- package/dist/runtime/env.d.ts +130 -0
- package/dist/runtime/env.js +76 -0
- package/dist/runtime/env.js.map +1 -0
- package/dist/security/index.d.ts +236 -0
- package/dist/security/index.js +260 -0
- package/dist/security/index.js.map +1 -0
- package/dist/validation-CzOz6fwq.d.ts +1126 -0
- package/dist/validation.d.ts +1 -0
- package/dist/validation.js +445 -0
- package/dist/validation.js.map +1 -0
- package/package.json +70 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,2293 @@
|
|
|
1
|
+
// src/internal/validation-utils.ts
|
|
2
|
+
/**
 * Sums a list of step metrics into run-level totals plus a per-provider breakdown.
 *
 * Every numeric field is treated as optional: a missing or falsy value counts as 0,
 * so one incomplete record cannot turn a total into NaN.
 *
 * @param {Array<object>} metrics - Metric records. Fields read: `ms`, `costUSD`,
 *   `inputTokens`, `outputTokens`, `cacheCreationInputTokens`, `cacheReadInputTokens`, `provider`.
 * @returns {object} Totals (`totalDurationMs`, `totalCostUSD`, `totalInputTokens`,
 *   `totalOutputTokens`, `totalCacheCreationTokens`, `totalCacheReadTokens`), `stepCount`
 *   (the number of records) and `byProvider` (cost / token / call counts keyed by provider).
 */
function aggregateMetrics(metrics) {
  const byProvider = {};
  const totals = {
    totalDurationMs: 0,
    totalCostUSD: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCacheCreationTokens: 0,
    totalCacheReadTokens: 0,
    stepCount: metrics.length,
    byProvider
  };
  for (const m of metrics) {
    totals.totalDurationMs += m.ms || 0;
    totals.totalCostUSD += m.costUSD || 0;
    totals.totalInputTokens += m.inputTokens || 0;
    totals.totalOutputTokens += m.outputTokens || 0;
    totals.totalCacheCreationTokens += m.cacheCreationInputTokens || 0;
    totals.totalCacheReadTokens += m.cacheReadInputTokens || 0;
    if (!m.provider) {
      continue;
    }
    // Own-property check: a provider named like an inherited member ("constructor",
    // "toString", ...) must get its own bucket instead of mutating a shared built-in.
    if (!Object.prototype.hasOwnProperty.call(byProvider, m.provider)) {
      byProvider[m.provider] = { costUSD: 0, inputTokens: 0, outputTokens: 0, callCount: 0 };
    }
    const bucket = byProvider[m.provider];
    bucket.costUSD += m.costUSD || 0;
    bucket.inputTokens += m.inputTokens || 0;
    bucket.outputTokens += m.outputTokens || 0;
    bucket.callCount += 1;
  }
  return totals;
}
|
|
33
|
+
// Builds a step descriptor that pairs a step key with the function that runs it.
var node = (key, run) => {
  return { key: key, run: run };
};
|
|
34
|
+
/**
 * Runs the given steps one after another, feeding each step the previous step's result.
 *
 * Each step is called as `step.run(value, ctx)` and its awaited result is recorded in
 * `artifacts` under `step.key` through `ctx.emit`.
 *
 * @param {Iterable<{key: string, run: Function}>} steps - Steps to execute in order.
 * @param {*} input - Value handed to the first step.
 * @param {object} [observabilityContext] - Exposed as `ctx.observability`; its `stepId`
 *   (if any) is copied to `ctx.stepId`.
 * @returns {Promise<{output: *, artifacts: object, metrics: Array}>} The last step's result
 *   (or `input` when there are no steps), all emitted artifacts, and all pushed metrics.
 */
async function runPipeline(steps, input, observabilityContext) {
  const artifacts = {};
  const metrics = [];
  const stepId = observabilityContext?.stepId;
  const ctx = {
    stepId,
    artifacts,
    emit: (name, value) => {
      artifacts[name] = value;
    },
    metrics: {
      push: (entry) => {
        return metrics.push(entry);
      }
    },
    observability: observabilityContext
  };
  let current = input;
  for (const step of steps) {
    const produced = await step.run(current, ctx);
    current = produced;
    ctx.emit(step.key, current);
  }
  return { output: current, artifacts: artifacts, metrics: metrics };
}
|
|
53
|
+
/**
 * Error describing a flow that failed part-way through.
 *
 * Carries the failing step (id, index, type), the steps that had already completed,
 * the underlying error, and whatever artifacts were passed in by the thrower.
 */
var FlowExecutionError = class _FlowExecutionError extends Error {
  constructor(message, failedStep, failedStepIndex, failedStepType, completedSteps, originalError, partialArtifacts) {
    super(message);
    Object.assign(this, {
      failedStep,
      failedStepIndex,
      failedStepType,
      completedSteps,
      originalError,
      partialArtifacts,
      name: "FlowExecutionError"
    });
    // captureStackTrace is not available in every runtime; when present, drop the constructor frame.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _FlowExecutionError);
    }
  }
};
|
|
68
|
+
/**
 * Error describing an invalid flow definition.
 *
 * Carries the reason, suggestions, and the source/target nodes and type information
 * supplied by the thrower.
 */
var FlowValidationError = class _FlowValidationError extends Error {
  constructor(message, reason, suggestions, sourceNode, targetNode, sourceOutputType, targetInputTypes) {
    super(message);
    Object.assign(this, {
      reason,
      suggestions,
      sourceNode,
      targetNode,
      sourceOutputType,
      targetInputTypes,
      name: "FlowValidationError"
    });
    // captureStackTrace is not available in every runtime; when present, drop the constructor frame.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _FlowValidationError);
    }
  }
};
|
|
83
|
+
var NODE_COMPATIBILITY_MATRIX = {
|
|
84
|
+
parse: {
|
|
85
|
+
parse: {
|
|
86
|
+
valid: false,
|
|
87
|
+
reason: "Cannot chain parse nodes. Parse is typically the starting node."
|
|
88
|
+
},
|
|
89
|
+
split: {
|
|
90
|
+
valid: false,
|
|
91
|
+
reason: "Split requires FlowInput, but parse outputs DocumentIR. Use split directly on input instead.",
|
|
92
|
+
note: "If you need to re-split after parsing, use trigger to invoke a child flow with FlowInput."
|
|
93
|
+
},
|
|
94
|
+
categorize: {
|
|
95
|
+
valid: true,
|
|
96
|
+
note: "categorize accepts DocumentIR and wraps it with {input, category}"
|
|
97
|
+
},
|
|
98
|
+
extract: {
|
|
99
|
+
valid: true,
|
|
100
|
+
note: "extract accepts DocumentIR and produces typed JSON"
|
|
101
|
+
},
|
|
102
|
+
chunk: {
|
|
103
|
+
valid: true,
|
|
104
|
+
note: "chunk accepts DocumentIR and produces ChunkOutput for RAG"
|
|
105
|
+
},
|
|
106
|
+
combine: {
|
|
107
|
+
valid: false,
|
|
108
|
+
reason: "Parse outputs DocumentIR (single document), not an array. Combine requires array input from forEach.",
|
|
109
|
+
note: "Use parse with chunked:true to output DocumentIR[], then use combine."
|
|
110
|
+
},
|
|
111
|
+
trigger: {
|
|
112
|
+
valid: true,
|
|
113
|
+
note: "trigger accepts any input type"
|
|
114
|
+
},
|
|
115
|
+
output: {
|
|
116
|
+
valid: true,
|
|
117
|
+
note: "output node can follow any node to select or transform results"
|
|
118
|
+
}
|
|
119
|
+
},
|
|
120
|
+
split: {
|
|
121
|
+
parse: {
|
|
122
|
+
valid: true,
|
|
123
|
+
requiresForEach: true,
|
|
124
|
+
reason: "Split outputs SplitDocument[] which requires forEach. forEach auto-unwraps SplitDocument.input \u2192 FlowInput for parse.",
|
|
125
|
+
note: "Enable forEach on split node before connecting to parse."
|
|
126
|
+
},
|
|
127
|
+
split: {
|
|
128
|
+
valid: false,
|
|
129
|
+
reason: "Cannot nest split operations. Split nodes cannot appear in forEach itemFlow."
|
|
130
|
+
},
|
|
131
|
+
categorize: {
|
|
132
|
+
valid: true,
|
|
133
|
+
requiresForEach: true,
|
|
134
|
+
reason: "Split outputs SplitDocument[] which requires forEach. forEach auto-unwraps SplitDocument.input for categorize."
|
|
135
|
+
},
|
|
136
|
+
extract: {
|
|
137
|
+
valid: true,
|
|
138
|
+
requiresForEach: true,
|
|
139
|
+
reason: "Split outputs SplitDocument[] which requires forEach. forEach auto-unwraps SplitDocument.input for extract."
|
|
140
|
+
},
|
|
141
|
+
chunk: {
|
|
142
|
+
valid: false,
|
|
143
|
+
reason: "SplitDocument output is incompatible with Chunk input. Chunk expects DocumentIR or DocumentIR[].",
|
|
144
|
+
note: "Use parse in forEach after split to convert SplitDocument \u2192 DocumentIR, then chunk."
|
|
145
|
+
},
|
|
146
|
+
combine: {
|
|
147
|
+
valid: false,
|
|
148
|
+
reason: "Combine should appear AFTER forEach completes, not as a forEach itemFlow step.",
|
|
149
|
+
note: "Place combine after the forEach block to merge results."
|
|
150
|
+
},
|
|
151
|
+
trigger: {
|
|
152
|
+
valid: true,
|
|
153
|
+
requiresForEach: true,
|
|
154
|
+
reason: "Split outputs SplitDocument[] which requires forEach for processing.",
|
|
155
|
+
note: "forEach auto-unwraps SplitDocument.input for child flow."
|
|
156
|
+
},
|
|
157
|
+
output: {
|
|
158
|
+
valid: true,
|
|
159
|
+
note: "output node can follow any node to select or transform results"
|
|
160
|
+
}
|
|
161
|
+
},
|
|
162
|
+
categorize: {
|
|
163
|
+
parse: {
|
|
164
|
+
valid: true,
|
|
165
|
+
note: "categorize outputs {input, category}. Conditional can unwrap this or use directly."
|
|
166
|
+
},
|
|
167
|
+
split: {
|
|
168
|
+
valid: false,
|
|
169
|
+
reason: "Split requires FlowInput, but categorize outputs {input, category}.",
|
|
170
|
+
note: "Use conditional to unwrap and pass input field to split."
|
|
171
|
+
},
|
|
172
|
+
categorize: {
|
|
173
|
+
valid: true,
|
|
174
|
+
note: "Can chain categorize nodes for multi-level classification."
|
|
175
|
+
},
|
|
176
|
+
extract: {
|
|
177
|
+
valid: true,
|
|
178
|
+
note: "extract can process the categorized document."
|
|
179
|
+
},
|
|
180
|
+
chunk: {
|
|
181
|
+
valid: false,
|
|
182
|
+
reason: "Categorize wraps input as {input, category}. Chunk needs unwrapped DocumentIR.",
|
|
183
|
+
note: "Use conditional to unwrap input field before chunk."
|
|
184
|
+
},
|
|
185
|
+
combine: {
|
|
186
|
+
valid: false,
|
|
187
|
+
reason: "Categorize outputs single result {input, category}, not an array. Combine requires array input."
|
|
188
|
+
},
|
|
189
|
+
trigger: {
|
|
190
|
+
valid: true,
|
|
191
|
+
note: "trigger accepts any input type, including {input, category}"
|
|
192
|
+
},
|
|
193
|
+
output: {
|
|
194
|
+
valid: true,
|
|
195
|
+
note: "output node can follow any node to select or transform results"
|
|
196
|
+
}
|
|
197
|
+
},
|
|
198
|
+
extract: {
|
|
199
|
+
parse: {
|
|
200
|
+
valid: false,
|
|
201
|
+
reason: "Extract outputs typed JSON (terminal node). Cannot pipe JSON to parse.",
|
|
202
|
+
note: "Extract should be one of the last steps in a flow. Use combine if extracting in parallel."
|
|
203
|
+
},
|
|
204
|
+
split: {
|
|
205
|
+
valid: false,
|
|
206
|
+
reason: "Extract outputs typed JSON (terminal node). Cannot pipe JSON to split."
|
|
207
|
+
},
|
|
208
|
+
categorize: {
|
|
209
|
+
valid: false,
|
|
210
|
+
reason: "Extract outputs typed JSON (terminal node). Cannot pipe JSON to categorize."
|
|
211
|
+
},
|
|
212
|
+
extract: {
|
|
213
|
+
valid: false,
|
|
214
|
+
reason: "Extract outputs typed JSON (terminal node). Cannot chain extractions on JSON output.",
|
|
215
|
+
note: "If you need multi-step extraction, extract from DocumentIR/ChunkOutput in parallel, then combine."
|
|
216
|
+
},
|
|
217
|
+
chunk: {
|
|
218
|
+
valid: false,
|
|
219
|
+
reason: "Extract outputs typed JSON, not DocumentIR. Chunk expects DocumentIR input."
|
|
220
|
+
},
|
|
221
|
+
combine: {
|
|
222
|
+
valid: true,
|
|
223
|
+
note: "Use combine to merge parallel extraction results from forEach."
|
|
224
|
+
},
|
|
225
|
+
trigger: {
|
|
226
|
+
valid: true,
|
|
227
|
+
note: "trigger accepts any input type, including extracted JSON"
|
|
228
|
+
},
|
|
229
|
+
output: {
|
|
230
|
+
valid: true,
|
|
231
|
+
note: "output node can follow any node to select or transform results"
|
|
232
|
+
}
|
|
233
|
+
},
|
|
234
|
+
chunk: {
|
|
235
|
+
parse: {
|
|
236
|
+
valid: false,
|
|
237
|
+
reason: "Chunk outputs ChunkOutput (specialized type), not FlowInput. Parse expects FlowInput as input."
|
|
238
|
+
},
|
|
239
|
+
split: {
|
|
240
|
+
valid: false,
|
|
241
|
+
reason: "Chunk outputs ChunkOutput, incompatible with Split input (FlowInput)."
|
|
242
|
+
},
|
|
243
|
+
categorize: {
|
|
244
|
+
valid: false,
|
|
245
|
+
reason: "Chunk outputs ChunkOutput, incompatible with Categorize input (DocumentIR|FlowInput).",
|
|
246
|
+
note: "Categorize before chunking, not after."
|
|
247
|
+
},
|
|
248
|
+
extract: {
|
|
249
|
+
valid: true,
|
|
250
|
+
note: "extract has special handling for ChunkOutput - extracts data from chunks."
|
|
251
|
+
},
|
|
252
|
+
chunk: {
|
|
253
|
+
valid: false,
|
|
254
|
+
reason: "Cannot chain chunk operations. Chunk only once per document.",
|
|
255
|
+
note: "Different chunking strategies should be applied to the original DocumentIR, not to chunks."
|
|
256
|
+
},
|
|
257
|
+
combine: {
|
|
258
|
+
valid: false,
|
|
259
|
+
reason: "Chunk outputs ChunkOutput (specialized type), not an array type. Combine expects T[].",
|
|
260
|
+
note: "Use chunk on individual documents in forEach, then extract, then combine extractions."
|
|
261
|
+
},
|
|
262
|
+
trigger: {
|
|
263
|
+
valid: true,
|
|
264
|
+
note: "trigger accepts any input type, including ChunkOutput"
|
|
265
|
+
},
|
|
266
|
+
output: {
|
|
267
|
+
valid: true,
|
|
268
|
+
note: "output node can follow any node to select or transform results"
|
|
269
|
+
}
|
|
270
|
+
},
|
|
271
|
+
combine: {
|
|
272
|
+
parse: {
|
|
273
|
+
valid: true,
|
|
274
|
+
note: "After combining, result can be re-parsed if needed."
|
|
275
|
+
},
|
|
276
|
+
split: {
|
|
277
|
+
valid: false,
|
|
278
|
+
reason: "Combine output depends on strategy. Split requires FlowInput.",
|
|
279
|
+
note: "Most combine strategies output merged objects/arrays, not FlowInput."
|
|
280
|
+
},
|
|
281
|
+
categorize: {
|
|
282
|
+
valid: true,
|
|
283
|
+
note: "Can categorize combined results."
|
|
284
|
+
},
|
|
285
|
+
extract: {
|
|
286
|
+
valid: true,
|
|
287
|
+
note: "Can extract from combined results."
|
|
288
|
+
},
|
|
289
|
+
chunk: {
|
|
290
|
+
valid: true,
|
|
291
|
+
note: "Can chunk combined DocumentIR. Only valid if combine output is DocumentIR or DocumentIR[]."
|
|
292
|
+
},
|
|
293
|
+
combine: {
|
|
294
|
+
valid: false,
|
|
295
|
+
reason: "Cannot chain combine nodes. Combine once per forEach operation."
|
|
296
|
+
},
|
|
297
|
+
trigger: {
|
|
298
|
+
valid: true,
|
|
299
|
+
note: "trigger accepts any input type"
|
|
300
|
+
},
|
|
301
|
+
output: {
|
|
302
|
+
valid: true,
|
|
303
|
+
note: "output node can follow any node to select or transform results"
|
|
304
|
+
}
|
|
305
|
+
},
|
|
306
|
+
trigger: {
|
|
307
|
+
parse: {
|
|
308
|
+
valid: true,
|
|
309
|
+
requiresRuntimeValidation: true,
|
|
310
|
+
note: "Valid only if child flow returns FlowInput. Type safety cannot be guaranteed at build-time."
|
|
311
|
+
},
|
|
312
|
+
split: {
|
|
313
|
+
valid: true,
|
|
314
|
+
requiresRuntimeValidation: true,
|
|
315
|
+
note: "Valid only if child flow returns FlowInput. Type safety cannot be guaranteed at build-time."
|
|
316
|
+
},
|
|
317
|
+
categorize: {
|
|
318
|
+
valid: true,
|
|
319
|
+
requiresRuntimeValidation: true,
|
|
320
|
+
note: "Valid only if child flow returns DocumentIR or FlowInput. Type safety cannot be guaranteed at build-time."
|
|
321
|
+
},
|
|
322
|
+
extract: {
|
|
323
|
+
valid: true,
|
|
324
|
+
requiresRuntimeValidation: true,
|
|
325
|
+
note: "Valid only if child flow returns DocumentIR, FlowInput, or ChunkOutput. Type safety cannot be guaranteed at build-time."
|
|
326
|
+
},
|
|
327
|
+
chunk: {
|
|
328
|
+
valid: true,
|
|
329
|
+
requiresRuntimeValidation: true,
|
|
330
|
+
note: "Valid only if child flow returns DocumentIR or DocumentIR[]. Type safety cannot be guaranteed at build-time."
|
|
331
|
+
},
|
|
332
|
+
combine: {
|
|
333
|
+
valid: true,
|
|
334
|
+
requiresRuntimeValidation: true,
|
|
335
|
+
note: "Valid only if child flow returns an array (T[]). Type safety cannot be guaranteed at build-time."
|
|
336
|
+
},
|
|
337
|
+
trigger: {
|
|
338
|
+
valid: true,
|
|
339
|
+
requiresRuntimeValidation: true,
|
|
340
|
+
note: "Can nest trigger nodes (with circular dependency detection and max depth limits). Output type depends on nested child flow."
|
|
341
|
+
},
|
|
342
|
+
output: {
|
|
343
|
+
valid: true,
|
|
344
|
+
note: "output node can follow any node to select or transform results"
|
|
345
|
+
}
|
|
346
|
+
},
|
|
347
|
+
output: {
|
|
348
|
+
parse: {
|
|
349
|
+
valid: false,
|
|
350
|
+
reason: "Output is a terminal node that selects/transforms results. Cannot chain to other nodes."
|
|
351
|
+
},
|
|
352
|
+
split: {
|
|
353
|
+
valid: false,
|
|
354
|
+
reason: "Output is a terminal node that selects/transforms results. Cannot chain to other nodes."
|
|
355
|
+
},
|
|
356
|
+
categorize: {
|
|
357
|
+
valid: false,
|
|
358
|
+
reason: "Output is a terminal node that selects/transforms results. Cannot chain to other nodes."
|
|
359
|
+
},
|
|
360
|
+
extract: {
|
|
361
|
+
valid: false,
|
|
362
|
+
reason: "Output is a terminal node that selects/transforms results. Cannot chain to other nodes."
|
|
363
|
+
},
|
|
364
|
+
chunk: {
|
|
365
|
+
valid: false,
|
|
366
|
+
reason: "Output is a terminal node that selects/transforms results. Cannot chain to other nodes."
|
|
367
|
+
},
|
|
368
|
+
combine: {
|
|
369
|
+
valid: false,
|
|
370
|
+
reason: "Output is a terminal node that selects/transforms results. Cannot chain to other nodes."
|
|
371
|
+
},
|
|
372
|
+
trigger: {
|
|
373
|
+
valid: false,
|
|
374
|
+
reason: "Output is a terminal node that selects/transforms results. Cannot chain to other nodes."
|
|
375
|
+
},
|
|
376
|
+
output: {
|
|
377
|
+
valid: true,
|
|
378
|
+
note: "Multiple output nodes are allowed to create multiple named outputs from a flow."
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
};
|
|
382
|
+
/**
 * Returns the node's `key` when it is one of the known node type names, otherwise null.
 *
 * @param {{key?: string}|null|undefined} node2 - Node descriptor to inspect.
 * @returns {string|null} The known type name, or null for a missing node, a missing key,
 *   or a key that is not a known type.
 */
function getNodeTypeName(node2) {
  const candidate = node2 ? node2.key : void 0;
  if (!candidate) {
    return null;
  }
  switch (candidate) {
    case "parse":
    case "split":
    case "categorize":
    case "extract":
    case "chunk":
    case "combine":
    case "trigger":
    case "output":
      return candidate;
    default:
      return null;
  }
}
|
|
388
|
+
/**
 * Returns the metadata attached to a node under `__meta`, or null when there is none.
 *
 * A null or undefined node yields null instead of throwing, matching how
 * getNodeTypeName treats a missing node.
 *
 * @param {{__meta?: object}|null|undefined} node2 - Node descriptor to inspect.
 * @returns {object|null} The node's `__meta` value, or null if absent or falsy.
 */
function getNodeTypeInfo(node2) {
  return node2?.__meta || null;
}
|
|
391
|
+
/**
 * Lists the target node types that `sourceType` may connect to.
 *
 * @param {string} sourceType - Source node type to look up in NODE_COMPATIBILITY_MATRIX.
 * @param {boolean} [includeForEach=false] - When true, targets whose rule has
 *   `requiresForEach` are included as well.
 * @returns {string[]} Target type names in matrix order; empty when the source type has no rules.
 */
function getCompatibleTargets(sourceType, includeForEach = false) {
  const rules = NODE_COMPATIBILITY_MATRIX[sourceType];
  if (!rules) {
    return [];
  }
  const targets = [];
  for (const [targetType, rule] of Object.entries(rules)) {
    const blockedByForEach = rule.requiresForEach && !includeForEach;
    if (rule.valid && !blockedByForEach) {
      targets.push(targetType);
    }
  }
  return targets;
}
|
|
400
|
+
/**
 * Builds human-readable lines describing where `sourceType` can connect.
 *
 * Direct targets are listed first, followed by the targets that only become available
 * with forEach enabled. Each target is rendered as a bullet with the rule's note, if any.
 *
 * @param {string} sourceType - Source node type.
 * @returns {string[]} Suggestion lines, or a single "terminal node" line when nothing is connectable.
 */
function getSuggestedConnections(sourceType) {
  const direct = getCompatibleTargets(sourceType, false);
  const viaForEach = getCompatibleTargets(sourceType, true).filter(
    (candidate) => !direct.includes(candidate)
  );
  if (direct.length === 0 && viaForEach.length === 0) {
    return [`${sourceType} has no standard outgoing connections (terminal node).`];
  }
  const toBullet = (target) => {
    const rule = NODE_COMPATIBILITY_MATRIX[sourceType][target];
    const suffix = rule.note ? ` - ${rule.note}` : "";
    return ` \u2022 ${target}${suffix}`;
  };
  const lines = [];
  if (direct.length > 0) {
    lines.push(`${sourceType} can connect to:`);
    for (const target of direct) {
      lines.push(toBullet(target));
    }
  }
  if (viaForEach.length > 0) {
    lines.push(`${sourceType} can connect to (with forEach enabled):`);
    for (const target of viaForEach) {
      lines.push(toBullet(target));
    }
  }
  return lines;
}
|
|
425
|
+
/**
 * Checks whether a `sourceType` node may be connected to a `targetType` node.
 *
 * @param {string} sourceType - Type of the upstream node.
 * @param {string} targetType - Type of the downstream node.
 * @param {boolean} [forEachEnabled=false] - Whether forEach is enabled on the source node.
 * @returns {object} `{valid: true}` (optionally with a `warning` when the rule needs runtime
 *   validation), or `{valid: false, reason, suggestions}` (plus `requiresForEach: true` when
 *   enabling forEach would make the connection valid).
 */
function validateNodeConnection(sourceType, targetType, forEachEnabled = false) {
  const rule = NODE_COMPATIBILITY_MATRIX[sourceType]?.[targetType];
  if (!rule) {
    const reason = `Unknown node type combination: ${sourceType} \u2192 ${targetType}`;
    return { valid: false, reason, suggestions: ["Ensure both nodes are valid node types."] };
  }
  if (!rule.valid) {
    return { valid: false, reason: rule.reason, suggestions: getSuggestedConnections(sourceType) };
  }
  const needsForEach = rule.requiresForEach && !forEachEnabled;
  if (needsForEach) {
    const howToEnable = [
      `Enable forEach on the ${sourceType} node:`,
      ` 1. Click the ${sourceType} node`,
      ` 2. Enable "forEach Processing" in the configuration`,
      ` 3. Try connecting again`,
      ""
    ];
    return {
      valid: false,
      reason: `Cannot connect ${sourceType} to ${targetType} without forEach enabled.`,
      suggestions: howToEnable.concat(getSuggestedConnections(sourceType)),
      requiresForEach: true
    };
  }
  if (!rule.requiresRuntimeValidation) {
    return { valid: true };
  }
  // Allowed, but compatibility depends on what the flow produces at runtime: surface a warning.
  const detail = rule.note || "Type compatibility depends on runtime values and cannot be validated at build-time.";
  return {
    valid: true,
    warning: `\u26A0\uFE0F ${sourceType} \u2192 ${targetType}: ${detail}`
  };
}
|
|
466
|
+
/**
 * Lists the node types that may start a forEach itemFlow after `parentType`,
 * i.e. the targets whose rule is both valid and marked `requiresForEach`.
 *
 * @param {string} parentType - Type of the node whose output is iterated.
 * @returns {string[]} Starter type names in matrix order; empty when the parent type has no rules.
 */
function getValidForEachStarters(parentType) {
  const rules = NODE_COMPATIBILITY_MATRIX[parentType];
  if (!rules) {
    return [];
  }
  const starters = [];
  for (const [targetType, rule] of Object.entries(rules)) {
    if (rule.valid && rule.requiresForEach) {
      starters.push(targetType);
    }
  }
  return starters;
}
|
|
471
|
+
/**
 * Checks whether `starterType` may be the first node of a forEach itemFlow after `parentType`.
 *
 * Only connections whose rule is valid AND marked `requiresForEach` qualify; every other
 * rule produces a failure with a reason and the list of valid starters.
 *
 * @param {string} parentType - Type of the node whose output is iterated.
 * @param {string} starterType - Type of the candidate first itemFlow node.
 * @returns {object} `{valid: true}` or `{valid: false, reason, suggestions}`.
 */
function canStartForEachItemFlow(parentType, starterType) {
  const rule = NODE_COMPATIBILITY_MATRIX[parentType]?.[starterType];
  if (!rule) {
    return {
      valid: false,
      reason: `Unknown node type combination: ${parentType} \u2192 forEach \u2192 ${starterType}`,
      suggestions: ["Ensure both nodes are valid node types."]
    };
  }
  if (rule.valid && rule.requiresForEach) {
    return { valid: true };
  }
  let explanation;
  if (rule.valid) {
    // Valid without forEach: the target consumes the whole array rather than single items.
    explanation = "This connection does not require forEach, meaning it expects the full array, not individual items.";
  } else {
    explanation = rule.reason || "Type incompatible with forEach unwrapped item.";
  }
  const starters = getValidForEachStarters(parentType);
  const suggestions = [];
  if (starters.length > 0) {
    suggestions.push(`Valid itemFlow starters for ${parentType}: ${starters.join(", ")}`);
  } else {
    suggestions.push(`${parentType} has no valid forEach itemFlow starters.`);
  }
  return {
    valid: false,
    reason: `${starterType} cannot start forEach itemFlow after ${parentType}. ${explanation}`,
    suggestions
  };
}
|
|
500
|
+
/**
 * Validates `data` against a JSON-Schema-like `schema` and returns `data` unchanged on success.
 *
 * Supported keywords: `type` (integer, number, string, boolean, object, array), `nullable`,
 * `required`, `properties`, `additionalProperties: false`, and a single-schema `items`.
 * Own properties named `__proto__`, `constructor` or `prototype` are always rejected on objects.
 * All problems are collected and reported together, one per line, each prefixed with the
 * dotted path of the offending value ("root" for the top-level value).
 *
 * @param {*} data - Value to validate.
 * @param {object} schema - Schema describing the expected shape.
 * @returns {*} `data`, when it satisfies the schema.
 * @throws {Error} "Schema validation failed:" followed by one line per violation.
 */
function validateJson(data, schema) {
  const errors = [];
  const MAX_DEPTH = 50;
  const dangerousProps = ["__proto__", "constructor", "prototype"];
  // Root-level children are reported as "name", nested ones as "parent.name".
  const childPath = (path, prop) => path ? `${path}.${prop}` : prop;
  function validate(value, schema2, path = "", depth = 0) {
    const label = path || "root";
    if (depth > MAX_DEPTH) {
      errors.push(`${label}: maximum nesting depth (${MAX_DEPTH}) exceeded`);
      return;
    }
    if (value === null || value === void 0) {
      if (!schema2.nullable) {
        errors.push(`${label}: value is null or undefined`);
      }
      return;
    }
    const actualType = Array.isArray(value) ? "array" : typeof value;
    const expectedType = schema2.type;
    if (!expectedType) {
      return;
    }
    if (expectedType === "integer") {
      if (typeof value !== "number" || !Number.isInteger(value)) {
        errors.push(`${label}: expected integer, got ${actualType}`);
      }
      return;
    }
    if (expectedType === "number" || expectedType === "string" || expectedType === "boolean") {
      if (typeof value !== expectedType) {
        errors.push(`${label}: expected ${expectedType}, got ${actualType}`);
      }
      return;
    }
    if (expectedType === "object") {
      if (typeof value !== "object" || Array.isArray(value)) {
        errors.push(`${label}: expected object, got ${actualType}`);
        return;
      }
      if (schema2.required && Array.isArray(schema2.required)) {
        for (const reqProp of schema2.required) {
          if (!(reqProp in value)) {
            errors.push(`${childPath(path, reqProp)}: required property missing`);
          }
        }
      }
      if (schema2.additionalProperties === false && schema2.properties) {
        const allAllowedProps = /* @__PURE__ */ new Set([
          ...Object.keys(schema2.properties),
          ...schema2.required || []
        ]);
        // Enumerable and non-enumerable own keys, each visited exactly once so that a
        // violation is reported a single time.
        const ownKeys = /* @__PURE__ */ new Set([...Object.keys(value), ...Object.getOwnPropertyNames(value)]);
        for (const key of ownKeys) {
          if (dangerousProps.includes(key)) {
            errors.push(`${childPath(path, key)}: dangerous property not allowed`);
            continue;
          }
          if (!allAllowedProps.has(key)) {
            errors.push(`${childPath(path, key)}: additional property not allowed`);
          }
        }
      } else {
        for (const key of dangerousProps) {
          if (key in value && Object.prototype.hasOwnProperty.call(value, key)) {
            errors.push(`${childPath(path, key)}: dangerous property not allowed`);
          }
        }
      }
      if (schema2.properties) {
        for (const [propName, propSchema] of Object.entries(schema2.properties)) {
          if (propName in value) {
            validate(value[propName], propSchema, childPath(path, propName), depth + 1);
          }
        }
      }
      return;
    }
    if (expectedType === "array") {
      if (!Array.isArray(value)) {
        errors.push(`${label}: expected array, got ${actualType}`);
        return;
      }
      // Tuple-style `items` (an array of schemas) is not validated.
      if (schema2.items && !Array.isArray(schema2.items)) {
        const itemSchema = schema2.items;
        value.forEach((item, index) => {
          validate(item, itemSchema, `${path}[${index}]`, depth + 1);
        });
      }
    }
  }
  validate(data, schema);
  if (errors.length > 0) {
    throw new Error(`Schema validation failed:\n${errors.join("\n")}`);
  }
  return data;
}
|
|
602
|
+
// Prompt variables that are auto-injected from node config, per node type.
// User-supplied variables with these names are ignored.
var RESERVED_VARIABLES = {
  extract: ["schema", "documentText", "schemaTitle", "schemaDescription", "structuredFormat"],
  categorize: ["categories", "documentText"],
  parse: ["format", "schema", "describeFigures", "citationsEnabled"]
};
/**
 * Merges user-supplied prompt variables over the auto-injected ones while keeping the
 * reserved variables of the node type at their auto-injected values.
 *
 * Node types without a reserved list (anything not in RESERVED_VARIABLES) have nothing to
 * protect and get a plain merge instead of throwing.
 *
 * @param {string} nodeType - Node type used to look up the reserved variable names.
 * @param {object|null|undefined} userVariables - Variables provided by the user.
 * @param {object} autoInjectedVariables - Variables derived from the node configuration.
 * @returns {object} `autoInjectedVariables` itself when there are no user variables; otherwise
 *   a new object. Every reserved name is present in that object, set to its auto-injected
 *   value (undefined when it was not injected). A console warning is logged when the user
 *   tried to override reserved names.
 */
function protectReservedVariables(nodeType, userVariables, autoInjectedVariables) {
  if (!userVariables || Object.keys(userVariables).length === 0) {
    return autoInjectedVariables;
  }
  // Own-property lookup so unknown types (and names like "constructor") resolve to "no reserved names".
  const reserved = Object.prototype.hasOwnProperty.call(RESERVED_VARIABLES, nodeType) ? RESERVED_VARIABLES[nodeType] : [];
  const attempted = reserved.filter((key) => key in userVariables);
  if (attempted.length > 0) {
    console.warn(
      `[doclo] Attempted to override reserved variables in ${nodeType} node: ${attempted.join(", ")}. These variables are auto-injected from config and cannot be overridden. They will be ignored.`
    );
  }
  return {
    ...autoInjectedVariables,
    ...userVariables,
    // Restore reserved variables to ensure they can't be overridden
    ...Object.fromEntries(
      reserved.map((key) => [key, autoInjectedVariables[key]])
    )
  };
}
|
|
632
|
+
|
|
633
|
+
// src/security/url-validator.ts
|
|
634
|
+
// Inclusive IPv4 ranges that count as internal; URLs whose host falls inside one are rejected.
var BLOCKED_IP_RANGES = [
  // "This network" (0.0.0.0/8): 0.0.0.0 reaches the local host on common systems, and the
  // IPv6 list already blocks the equivalent unspecified address (::).
  { start: "0.0.0.0", end: "0.255.255.255" },
  // Loopback
  { start: "127.0.0.0", end: "127.255.255.255" },
  // Private Class A
  { start: "10.0.0.0", end: "10.255.255.255" },
  // Private Class B
  { start: "172.16.0.0", end: "172.31.255.255" },
  // Private Class C
  { start: "192.168.0.0", end: "192.168.255.255" },
  // Link Local
  { start: "169.254.0.0", end: "169.254.255.255" }
];
|
|
646
|
+
// Hostnames and literal addresses that are rejected outright as metadata-service endpoints.
// Entries are compared against the URL hostname by exact match.
var BLOCKED_METADATA_HOSTS = [
  "169.254.169.254",
  // AWS metadata service
  "169.254.169.253",
  // AWS metadata service (Windows)
  "metadata.google.internal",
  // GCP metadata service
  "metadata",
  // GCP alias
  "100.100.100.200",
  // Aliyun metadata service
  "instance-data"
  // OpenStack alias
];
|
|
660
|
+
// Patterns for IPv6 addresses (brackets already removed) that must not be fetched.
// Each prefix pattern covers its whole CIDR range, not just the first group of that range:
// the first 16-bit group is matched as four hex digits, so shortened groups such as
// "fc::1" (= 00fc::1, a different range) do not match.
var BLOCKED_IPV6_PATTERNS = [
  /^::1$/,
  // Loopback (::1)
  /^::$/,
  // Any address (::)
  /^::ffff:/i,
  // IPv4-mapped IPv6 (::ffff:0:0/96) - matches ::ffff:127.0.0.1
  /^::ffff:0:/i,
  // IPv4-mapped IPv6 alternative
  /^fe[89ab][0-9a-f]:/i,
  // Link-local (fe80::/10 = fe80..febf)
  /^fe[c-f][0-9a-f]:/i,
  // Site-local deprecated (fec0::/10 = fec0..feff)
  /^fc[0-9a-f]{2}:/i,
  // Unique local address (fc00::/7, fc half)
  /^fd[0-9a-f]{2}:/i,
  // Unique local address (fc00::/7, fd half = fd00::/8)
  /^ff[0-9a-f]{2}:/i,
  // Multicast (ff00::/8)
  /^0:0:0:0:0:0:0:1$/i
  // Loopback expanded form
];
|
|
682
|
+
/**
 * Converts a dotted-quad IPv4 string to its unsigned 32-bit numeric value.
 *
 * The value is built arithmetically rather than with `<<`: bit shifts work on signed 32-bit
 * integers, which would make addresses with a first octet >= 128 negative and would make
 * 255.255.255.255 collide with the -1 "invalid" sentinel.
 *
 * @param {string} ip - Candidate IPv4 address, e.g. "192.168.0.1".
 * @returns {number} A value in 0..4294967295, or -1 when `ip` does not consist of exactly
 *   four parts that each convert to an integer in 0..255.
 */
function ipToNumber(ip) {
  const parts = ip.split(".").map(Number);
  // Number.isInteger also rejects NaN (non-numeric parts), which the range comparison alone lets through.
  const invalid = parts.length !== 4 || parts.some((p) => !Number.isInteger(p) || p < 0 || p > 255);
  if (invalid) {
    return -1;
  }
  return parts.reduce((acc, octet) => acc * 256 + octet, 0);
}
|
|
689
|
+
/**
 * Tells whether `ip` lies inside any range listed in BLOCKED_IP_RANGES (bounds inclusive).
 *
 * @param {string} ip - Candidate IPv4 address in dotted-quad form.
 * @returns {boolean} True when the address is inside a blocked range; false otherwise,
 *   including when ipToNumber reports the input as invalid (-1).
 */
function isIpInBlockedRange(ip) {
  const candidate = ipToNumber(ip);
  if (candidate === -1) {
    return false;
  }
  for (const { start, end } of BLOCKED_IP_RANGES) {
    const lower = ipToNumber(start);
    const upper = ipToNumber(end);
    if (candidate >= lower && candidate <= upper) {
      return true;
    }
  }
  return false;
}
|
|
698
|
+
/**
 * Tells whether an IPv6 hostname matches any pattern in BLOCKED_IPV6_PATTERNS.
 *
 * @param {string} hostname - IPv6 address, with or without the surrounding square brackets.
 * @returns {boolean} True when the bracket-stripped address matches a blocked pattern.
 */
function isIPv6Blocked(hostname) {
  // A leading "[" and a trailing "]" are removed before matching.
  const bare = hostname.replace(/^\[|\]$/g, "");
  for (const pattern of BLOCKED_IPV6_PATTERNS) {
    if (pattern.test(bare)) {
      return true;
    }
  }
  return false;
}
|
|
702
|
+
/**
 * Parses `urlString` and rejects URLs that use a disallowed protocol or, unless disabled,
 * point at internal or metadata-service hosts.
 *
 * Host checks run on the hostname with a single trailing dot removed, so the fully-qualified
 * spelling of a blocked name ("localhost.", "metadata.google.internal.") is blocked too.
 * Names under the reserved ".localhost" domain are treated like "localhost".
 * NOTE(review): only the literal hostname is inspected; this function does not resolve DNS.
 *
 * @param {string} urlString - URL to validate.
 * @param {object} [options]
 * @param {boolean} [options.blockInternal=true] - Reject metadata hosts, blocked IPv4/IPv6
 *   addresses and localhost.
 * @param {string[]} [options.allowedProtocols=["http:", "https:"]] - Accepted `url.protocol` values.
 * @returns {URL} The parsed URL.
 * @throws {Error} When the URL cannot be parsed, uses a protocol that is not allowed, or
 *   targets a blocked host.
 */
function validateUrl(urlString, options = {}) {
  const {
    blockInternal = true,
    allowedProtocols = ["http:", "https:"]
  } = options;
  let url;
  try {
    url = new URL(urlString);
  } catch (error) {
    throw new Error(`Invalid URL: ${urlString}`, { cause: error });
  }
  if (!allowedProtocols.includes(url.protocol)) {
    throw new Error(
      `Blocked protocol: ${url.protocol}. Allowed: ${allowedProtocols.join(", ")}`
    );
  }
  if (!blockInternal) {
    return url;
  }
  // "name." and "name" denote the same host; compare the dot-less form.
  const hostname = url.hostname.replace(/\.$/, "");
  if (BLOCKED_METADATA_HOSTS.includes(hostname)) {
    throw new Error(`Blocked metadata service: ${hostname}`);
  }
  const looksLikeIPv6 = hostname.includes(":") || hostname.startsWith("[");
  if (looksLikeIPv6 && isIPv6Blocked(hostname)) {
    throw new Error(`Blocked IPv6 address: ${hostname}`);
  }
  if (isIpInBlockedRange(hostname)) {
    throw new Error(`Blocked internal IP address: ${hostname}`);
  }
  if (hostname === "localhost" || hostname.endsWith(".localhost")) {
    throw new Error("Blocked localhost access");
  }
  return url;
}
|
|
737
|
+
|
|
738
|
+
// src/security/resource-limits.ts
|
|
739
|
+
// Fallback resource ceilings used as default arguments by the size and
// timeout helpers in this module.
var DEFAULT_LIMITS = {
  // Largest accepted file, in bytes (100 MB).
  MAX_FILE_SIZE: 104857600,
  // Request timeout in milliseconds (30 seconds).
  REQUEST_TIMEOUT: 30 * 1000,
  // Maximum JSON parse depth.
  MAX_JSON_DEPTH: 100
};
|
|
747
|
+
/**
 * Throws when a byte count is larger than the permitted maximum.
 *
 * @param {number} size - Size to check, in bytes.
 * @param {number} [maxSize=DEFAULT_LIMITS.MAX_FILE_SIZE] - Upper bound, in bytes.
 * @returns {void}
 * @throws {Error} When `size > maxSize`; the message reports both values
 *   rounded to whole megabytes.
 */
function validateFileSize(size, maxSize = DEFAULT_LIMITS.MAX_FILE_SIZE) {
  if (!(size > maxSize)) {
    return;
  }
  const toWholeMB = (byteCount) => Math.round(byteCount / 1024 / 1024);
  const maxMB = toWholeMB(maxSize);
  const sizeMB = toWholeMB(size);
  throw new Error(
    `File size ${sizeMB}MB exceeds maximum allowed size of ${maxMB}MB`
  );
}
|
|
756
|
+
/**
 * Creates an AbortController that aborts itself after `timeoutMs`.
 *
 * The timer handle is stored on the controller as `__timeoutId` so that
 * `cleanupFetchController` can cancel it later.
 *
 * @param {number} [timeoutMs=DEFAULT_LIMITS.REQUEST_TIMEOUT] - Delay in
 *   milliseconds before the controller aborts.
 * @returns {AbortController} Controller with an extra `__timeoutId` property.
 */
function createFetchController(timeoutMs = DEFAULT_LIMITS.REQUEST_TIMEOUT) {
  const abortController = new AbortController();
  abortController.__timeoutId = setTimeout(() => {
    abortController.abort();
  }, timeoutMs);
  return abortController;
}
|
|
762
|
+
/**
 * Cancels the pending timer stored on a controller as `__timeoutId`, if any.
 *
 * @param {AbortController} controller - Controller carrying `__timeoutId`.
 * @returns {void}
 */
function cleanupFetchController(controller) {
  const { __timeoutId: pendingTimer } = controller;
  if (!pendingTimer) {
    return;
  }
  clearTimeout(pendingTimer);
}
|
|
768
|
+
/**
 * Calls `fetch` with an abort timeout.
 *
 * A `signal` supplied in `options` is honoured as well: aborting it aborts
 * the request. Previously the caller's signal was overwritten by the
 * internal timeout signal and cancellation requests were silently lost.
 *
 * The timer is cleared as soon as `fetch` settles, so it bounds the wait for
 * the response to arrive, not the later reading of the response body.
 *
 * @param {string|URL|Request} url - Resource to fetch.
 * @param {object} [options] - `fetch` init options; `cache` is always forced
 *   to "no-store".
 * @param {number} [timeoutMs=DEFAULT_LIMITS.REQUEST_TIMEOUT] - Timeout in ms.
 * @returns {Promise<Response>} The fetch response.
 */
async function fetchWithTimeout(url, options = {}, timeoutMs = DEFAULT_LIMITS.REQUEST_TIMEOUT) {
  const controller = createFetchController(timeoutMs);
  const callerSignal = options.signal;
  const forwardAbort = () => controller.abort(callerSignal.reason);
  if (callerSignal) {
    if (callerSignal.aborted) {
      forwardAbort();
    } else {
      callerSignal.addEventListener("abort", forwardAbort, { once: true });
    }
  }
  try {
    const response = await fetch(url, {
      ...options,
      signal: controller.signal,
      cache: "no-store"
      // Prevent Next.js cache revalidation which can cause AbortError (see: github.com/vercel/next.js/issues/54045)
    });
    return response;
  } finally {
    callerSignal?.removeEventListener("abort", forwardAbort);
    cleanupFetchController(controller);
  }
}
|
|
782
|
+
|
|
783
|
+
// src/runtime/base64.ts
|
|
784
|
+
/**
 * Encodes binary data as a base64 string.
 *
 * Uses the `Buffer` global when it exists; otherwise builds a binary string
 * byte by byte and encodes it with `btoa`.
 *
 * @param {ArrayBuffer} buffer - Bytes to encode.
 * @returns {string} Base64 text.
 */
function arrayBufferToBase64(buffer) {
  const hasBufferGlobal = typeof Buffer !== "undefined";
  if (hasBufferGlobal) {
    return Buffer.from(buffer).toString("base64");
  }
  const view = new Uint8Array(buffer);
  let binaryText = "";
  for (const byte of view) {
    binaryText += String.fromCharCode(byte);
  }
  return btoa(binaryText);
}
|
|
795
|
+
/**
 * Decodes base64 text into an ArrayBuffer.
 *
 * A leading `data:<type>;base64,` prefix is removed first. Uses the `Buffer`
 * global when it exists, otherwise `atob`.
 *
 * @param {string} base64 - Base64 text or base64 data URI.
 * @returns {ArrayBuffer} The decoded bytes.
 */
function base64ToArrayBuffer(base64) {
  const payload = base64.replace(/^data:[^;]+;base64,/, "");
  if (typeof Buffer === "undefined") {
    const decodedText = atob(payload);
    const decodedBytes = Uint8Array.from(decodedText, (ch) => ch.charCodeAt(0));
    return decodedBytes.buffer;
  }
  const nodeBuffer = Buffer.from(payload, "base64");
  // Slice by offset and length so only this Buffer's own byte range of the
  // backing ArrayBuffer is returned.
  const begin = nodeBuffer.byteOffset;
  return nodeBuffer.buffer.slice(begin, begin + nodeBuffer.byteLength);
}
|
|
808
|
+
/**
 * Encodes a Uint8Array as base64.
 *
 * @param {Uint8Array} bytes - View whose byte range is encoded.
 * @returns {string} Base64 text.
 */
function uint8ArrayToBase64(bytes) {
  const { buffer, byteOffset, byteLength } = bytes;
  // Copy just the bytes covered by the view, not the whole backing buffer.
  const exactRange = buffer.slice(byteOffset, byteOffset + byteLength);
  return arrayBufferToBase64(exactRange);
}
|
|
811
|
+
/**
 * Builds a base64 data URI from binary data.
 *
 * @param {ArrayBuffer} buffer - Bytes to embed.
 * @param {string} [mimeType="application/octet-stream"] - MIME type placed
 *   in the URI header.
 * @returns {string} `data:<mimeType>;base64,<payload>`.
 */
function createDataUri(buffer, mimeType = "application/octet-stream") {
  const encoded = arrayBufferToBase64(buffer);
  const header = `data:${mimeType};base64,`;
  return `${header}${encoded}`;
}
|
|
815
|
+
|
|
816
|
+
// src/mime-detection.ts
|
|
817
|
+
import { fileTypeFromBuffer } from "file-type";
|
|
818
|
+
/**
 * Detects a MIME type by decoding the full base64 payload and passing the
 * bytes to `fileTypeFromBuffer`.
 *
 * @param {string} base64Data - Base64 text; when it contains a comma, the
 *   segment after the first comma is used.
 * @returns {Promise<string>} The detected MIME type.
 * @throws {Error} When `fileTypeFromBuffer` yields no result; the message
 *   lists the first four bytes in hex.
 */
async function detectMimeTypeFromBase64Async(base64Data) {
  const payload = base64Data.includes(",") ? base64Data.split(",")[1] : base64Data;
  const decodedText = atob(payload);
  const bytes = Uint8Array.from(decodedText, (ch) => ch.charCodeAt(0));
  const detected = await fileTypeFromBuffer(bytes);
  if (!detected) {
    throw new Error(
      `Unsupported file format. Magic bytes: ${Array.from(bytes.slice(0, 4)).map((b) => b.toString(16).padStart(2, "0")).join(" ")}`
    );
  }
  return detected.mime;
}
|
|
833
|
+
/**
 * Detects a MIME type synchronously from the start of a base64 payload.
 *
 * Only the first 24 base64 characters are decoded before the bytes are
 * handed to `detectMimeTypeFromBytes`.
 *
 * @param {string} base64Data - Base64 text; when it contains a comma, the
 *   segment after the first comma is used.
 * @returns {string} The MIME type returned by `detectMimeTypeFromBytes`.
 */
function detectMimeTypeFromBase64(base64Data) {
  const payload = base64Data.includes(",") ? base64Data.split(",")[1] : base64Data;
  const headerText = atob(payload.substring(0, 24));
  const headerBytes = Uint8Array.from(headerText, (ch) => ch.charCodeAt(0));
  return detectMimeTypeFromBytes(headerBytes);
}
|
|
842
|
+
/**
 * Identifies a file format from its leading magic bytes.
 *
 * Signatures are tried in order and the first match wins.
 *
 * @param {Uint8Array} bytes - Leading bytes of the file (at least 4).
 * @returns {string} One of: image/jpeg, image/png, image/gif, image/webp,
 *   application/pdf, image/tiff, image/bmp, application/rtf,
 *   application/zip, application/x-cfb.
 * @throws {Error} When fewer than 4 bytes are supplied or no signature matches.
 */
function detectMimeTypeFromBytes(bytes) {
  if (bytes.length < 4) {
    throw new Error("Insufficient data to detect MIME type (need at least 4 bytes)");
  }
  // Each entry: resulting MIME type, minimum input length, and one or more
  // [offset, expected bytes] segments that must all be present.
  const signatures = [
    { mime: "image/jpeg", minLength: 0, segments: [[0, [255, 216, 255]]] },
    { mime: "image/png", minLength: 0, segments: [[0, [137, 80, 78, 71]]] },
    { mime: "image/gif", minLength: 0, segments: [[0, [71, 73, 70, 56]]] },
    { mime: "image/webp", minLength: 12, segments: [[0, [82, 73, 70, 70]], [8, [87, 69, 66, 80]]] },
    { mime: "application/pdf", minLength: 0, segments: [[0, [37, 80, 68, 70]]] },
    { mime: "image/tiff", minLength: 0, segments: [[0, [73, 73, 42, 0]]] },
    { mime: "image/tiff", minLength: 0, segments: [[0, [77, 77, 0, 42]]] },
    { mime: "image/bmp", minLength: 0, segments: [[0, [66, 77]]] },
    { mime: "application/rtf", minLength: 0, segments: [[0, [123, 92, 114, 116, 102]]] },
    { mime: "application/zip", minLength: 0, segments: [[0, [80, 75, 3, 4]]] },
    { mime: "application/x-cfb", minLength: 8, segments: [[0, [208, 207, 17, 224, 161, 177, 26, 225]]] }
  ];
  const segmentMatches = ([offset, expected]) => expected.every((value, i) => bytes[offset + i] === value);
  for (const { mime, minLength, segments } of signatures) {
    if (bytes.length >= minLength && segments.every(segmentMatches)) {
      return mime;
    }
  }
  throw new Error(
    `Unsupported file format. Magic bytes: ${Array.from(bytes.slice(0, 4)).map((b) => b.toString(16).padStart(2, "0")).join(" ")}`
  );
}
|
|
880
|
+
/**
 * Compares a declared MIME type with the one detected synchronously from
 * the data.
 *
 * @param {string} base64Data - Base64 payload or data URI.
 * @param {string} declaredMimeType - MIME type the caller claims.
 * @returns {{isValid: boolean, actualMimeType: string, declaredMimeType: string}}
 *   `isValid` is `true` only on an exact string match.
 */
function validateMimeType(base64Data, declaredMimeType) {
  const detected = detectMimeTypeFromBase64(base64Data);
  const matchesDeclared = detected === declaredMimeType;
  return {
    isValid: matchesDeclared,
    actualMimeType: detected,
    declaredMimeType
  };
}
|
|
888
|
+
/**
 * Compares a declared MIME type with the one detected by
 * `detectMimeTypeFromBase64Async`.
 *
 * @param {string} base64Data - Base64 payload or data URI.
 * @param {string} declaredMimeType - MIME type the caller claims.
 * @returns {Promise<{isValid: boolean, actualMimeType: string, declaredMimeType: string}>}
 *   `isValid` is `true` only on an exact string match.
 */
async function validateMimeTypeAsync(base64Data, declaredMimeType) {
  const detected = await detectMimeTypeFromBase64Async(base64Data);
  const matchesDeclared = detected === declaredMimeType;
  return {
    isValid: matchesDeclared,
    actualMimeType: detected,
    declaredMimeType
  };
}
|
|
896
|
+
/**
 * Returns the payload part of a data URI, or the input unchanged when it is
 * not a data URI.
 *
 * @param {string} data - Data URI or raw base64 text.
 * @returns {string} Everything after the first comma of a data URI, else `data`.
 * @throws {Error} When `data` starts with "data:" but contains no comma.
 */
function extractBase64(data) {
  if (!data.startsWith("data:")) {
    return data;
  }
  const separatorAt = data.indexOf(",");
  if (separatorAt < 0) {
    throw new Error("Invalid data URI: missing comma separator");
  }
  return data.slice(separatorAt + 1);
}
|
|
906
|
+
|
|
907
|
+
// src/internal/file-utils.ts
|
|
908
|
+
/**
 * Classifies a document input string by its prefix.
 *
 * @param {string} input - Data URI or HTTP(S) URL.
 * @returns {"data-uri"|"url"} The detected input kind.
 * @throws {Error} For any other input (for example a file path).
 */
function detectInputType(input) {
  if (input.startsWith("data:")) {
    return "data-uri";
  }
  const isHttpUrl = ["http://", "https://"].some((scheme) => input.startsWith(scheme));
  if (isHttpUrl) {
    return "url";
  }
  throw new Error(
    'Edge Runtime does not support file paths. Use HTTP URLs, data URIs, or pass ArrayBuffer/base64 data directly.\nExample: await resolveDocument("https://example.com/doc.pdf") or resolveDocument("data:application/pdf;base64,...")'
  );
}
|
|
915
|
+
/**
 * Derives a MIME type for a document input.
 *
 * Sources are consulted in order: the type declared in a data URI header,
 * the media type of `contentType` (parameters dropped), and finally the file
 * extension found at the end of `input` or directly before a "?".
 *
 * @param {string} input - Data URI, URL or path.
 * @param {string} [contentType] - Optional Content-Type header value.
 * @returns {string} The MIME type, or "application/octet-stream" when
 *   nothing matches.
 */
function detectMimeType(input, contentType) {
  if (input.startsWith("data:")) {
    const declared = input.match(/^data:([^;,]+)/);
    if (declared) return declared[1];
  }
  if (contentType) {
    const mediaType = contentType.match(/^([^;]+)/);
    if (mediaType) return mediaType[1].trim();
  }
  // Ordered list: the first extension that matches decides the result.
  const extensionTable = [
    [".pdf", "application/pdf"],
    [".png", "image/png"],
    [".webp", "image/webp"],
    [".jpg", "image/jpeg"],
    [".jpeg", "image/jpeg"],
    [".gif", "image/gif"],
    [".tiff", "image/tiff"],
    [".tif", "image/tiff"],
    [".bmp", "image/bmp"],
    [".heic", "image/heic"],
    [".heif", "image/heif"],
    [".psd", "image/vnd.adobe.photoshop"],
    [".doc", "application/msword"],
    [".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
    [".xls", "application/vnd.ms-excel"],
    [".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"],
    [".ppt", "application/vnd.ms-powerpoint"],
    [".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"],
    [".odt", "application/vnd.oasis.opendocument.text"],
    [".ods", "application/vnd.oasis.opendocument.spreadsheet"],
    [".odp", "application/vnd.oasis.opendocument.presentation"],
    [".txt", "text/plain"],
    [".csv", "text/csv"],
    [".html", "text/html"],
    [".htm", "text/html"],
    [".rtf", "application/rtf"],
    [".epub", "application/epub+zip"]
  ];
  const lowered = input.toLowerCase();
  for (const [extension, mimeType] of extensionTable) {
    if (lowered.endsWith(extension) || lowered.includes(`${extension}?`)) {
      return mimeType;
    }
  }
  return "application/octet-stream";
}
|
|
954
|
+
// MIME types that document-type detection is allowed to report.
var KNOWN_MIME_TYPES = [
  // Portable Document Format
  "application/pdf",
  // Common raster images
  "image/jpeg",
  "image/png",
  "image/gif",
  "image/webp",
  // Further image formats
  "image/tiff",
  "image/bmp",
  "image/heic",
  "image/heif",
  "image/vnd.adobe.photoshop",
  // Microsoft Office (legacy and OpenXML)
  "application/msword",
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  "application/vnd.ms-excel",
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  "application/vnd.ms-powerpoint",
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  // OpenDocument
  "application/vnd.oasis.opendocument.text",
  "application/vnd.oasis.opendocument.spreadsheet",
  "application/vnd.oasis.opendocument.presentation",
  // Text-based formats
  "text/plain",
  "text/csv",
  "text/html",
  "application/rtf",
  // E-books
  "application/epub+zip"
];
|
|
987
|
+
// Lookup from lower-case file extension (with leading dot) to MIME type.
var EXTENSION_TO_MIME = {
  // Portable Document Format
  ".pdf": "application/pdf",
  // Common raster images
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".png": "image/png",
  ".gif": "image/gif",
  ".webp": "image/webp",
  // Further image formats
  ".tiff": "image/tiff",
  ".tif": "image/tiff",
  ".bmp": "image/bmp",
  ".heic": "image/heic",
  ".heif": "image/heif",
  ".psd": "image/vnd.adobe.photoshop",
  // Microsoft Office (legacy and OpenXML)
  ".doc": "application/msword",
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".xls": "application/vnd.ms-excel",
  ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  ".ppt": "application/vnd.ms-powerpoint",
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  // OpenDocument
  ".odt": "application/vnd.oasis.opendocument.text",
  ".ods": "application/vnd.oasis.opendocument.spreadsheet",
  ".odp": "application/vnd.oasis.opendocument.presentation",
  // Text-based formats
  ".txt": "text/plain",
  ".csv": "text/csv",
  ".html": "text/html",
  ".htm": "text/html",
  ".rtf": "application/rtf",
  // E-books
  ".epub": "application/epub+zip"
};
|
|
1023
|
+
/**
 * Extracts the lower-cased file extension (including the dot) from a path.
 *
 * Anything from the first "?" onward is ignored, and only the final path
 * segment is inspected. Searching the whole path for the last dot used to
 * treat a dotted directory name as the extension ("/v1.2/download" gave
 * ".2/download").
 *
 * @param {string} path - URL pathname or path-like string.
 * @returns {string|null} Extension such as ".pdf", or `null` when the final
 *   segment contains no dot.
 */
function getExtensionFromPath(path) {
  const pathWithoutQuery = path.split("?")[0];
  const fileName = pathWithoutQuery.slice(pathWithoutQuery.lastIndexOf("/") + 1);
  const lastDot = fileName.lastIndexOf(".");
  if (lastDot === -1) return null;
  return fileName.slice(lastDot).toLowerCase();
}
|
|
1029
|
+
/**
 * Determines the MIME type of a document input, or "unknown".
 *
 * Data URIs are judged by their declared type; other inputs by the file
 * extension of their URL pathname (or of the raw string when it is not a
 * parseable URL). If that gives no known type, the leading bytes are sniffed
 * via `detectMimeTypeFromBase64`. Only values present in KNOWN_MIME_TYPES
 * (or mapped by EXTENSION_TO_MIME) are returned.
 *
 * @param {string} input - Data URI, URL, path or raw base64.
 * @returns {string} A MIME type, or "unknown".
 */
function detectDocumentType(input) {
  if (!input) return "unknown";
  if (input.startsWith("data:")) {
    const declaredType = input.match(/^data:([^;,]+)/)?.[1];
    if (declaredType !== undefined && KNOWN_MIME_TYPES.includes(declaredType)) {
      return declaredType;
    }
  } else {
    let extension = null;
    try {
      extension = getExtensionFromPath(new URL(input).pathname);
    } catch {
      // Not an absolute URL: treat the raw string as a path.
      extension = getExtensionFromPath(input);
    }
    if (extension && extension in EXTENSION_TO_MIME) {
      return EXTENSION_TO_MIME[extension];
    }
  }
  try {
    const sniffedType = detectMimeTypeFromBase64(input);
    if (KNOWN_MIME_TYPES.includes(sniffedType)) {
      return sniffedType;
    }
  } catch {
    // Sniffing failures are deliberately ignored; the result is "unknown".
  }
  return "unknown";
}
|
|
1061
|
+
/**
 * Reports whether `detectDocumentType` classifies the input as a PDF.
 *
 * @param {string} input - Data URI, URL, path or raw base64.
 * @returns {boolean} `true` when the detected type is "application/pdf".
 */
function isPDFDocument(input) {
  const detectedType = detectDocumentType(input);
  return detectedType === "application/pdf";
}
|
|
1064
|
+
/**
 * Resolves a document input to a base64 data URI.
 *
 * Data URIs are format-checked and returned unchanged. HTTP(S) URLs are
 * validated with `validateUrl`, downloaded with a timeout, size-checked and
 * converted to a data URI.
 *
 * @param {string} input - Data URI or HTTP(S) URL.
 * @param {{requestTimeout?: number, maxFileSize?: number}} [limits] -
 *   Overrides for DEFAULT_LIMITS.REQUEST_TIMEOUT / MAX_FILE_SIZE.
 * @returns {Promise<string>} `data:<mime>;base64,<payload>`.
 * @throws {Error} For malformed data URIs, and for any URL failure (wrapped
 *   as "Failed to fetch URL ..." with the original error attached as `cause`).
 */
async function resolveDocument(input, limits) {
  const inputType = detectInputType(input);
  switch (inputType) {
    case "data-uri": {
      if (!input.match(/^data:[^;,]+;base64,/)) {
        throw new Error("Invalid data URI format. Expected: data:<mimetype>;base64,<data>");
      }
      return input;
    }
    case "url": {
      try {
        // NOTE(review): only the initial URL is validated here; whether
        // redirect targets are checked depends on fetch's redirect handling,
        // which this block does not configure — confirm.
        validateUrl(input);
        const timeout = limits?.requestTimeout ?? DEFAULT_LIMITS.REQUEST_TIMEOUT;
        const maxSize = limits?.maxFileSize ?? DEFAULT_LIMITS.MAX_FILE_SIZE;
        const response = await fetchWithTimeout(input, {}, timeout);
        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }
        // Reject early on the advertised size, then check the real size
        // again below because the header may be absent or wrong.
        const contentLength = response.headers.get("content-length");
        if (contentLength) {
          validateFileSize(parseInt(contentLength, 10), maxSize);
        }
        const arrayBuffer = await response.arrayBuffer();
        validateFileSize(arrayBuffer.byteLength, maxSize);
        const base64 = arrayBufferToBase64(arrayBuffer);
        const mimeType = detectMimeType(input, response.headers.get("content-type") || void 0);
        return `data:${mimeType};base64,${base64}`;
      } catch (error) {
        // Keep the original error reachable for callers and logs, and do not
        // assume the thrown value is an Error instance.
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`Failed to fetch URL ${input}: ${reason}`, { cause: error });
      }
    }
  }
}
|
|
1096
|
+
/**
 * Converts binary data to a base64 data URI via `createDataUri`.
 *
 * @param {ArrayBuffer|Uint8Array} buffer - Bytes to embed; for a Uint8Array
 *   only the bytes covered by the view are used.
 * @param {string} [mimeType] - MIME type forwarded to `createDataUri`.
 * @returns {string} The data URI.
 */
function bufferToDataUri(buffer, mimeType) {
  const source = buffer instanceof Uint8Array
    ? buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
    : buffer;
  return createDataUri(source, mimeType);
}
|
|
1103
|
+
/**
 * Delegates to `bufferToDataUri`; despite the name, the result is that
 * function's data URI rather than bare base64 text.
 *
 * @param {ArrayBuffer|Uint8Array} buffer - Bytes to embed.
 * @param {string} [mimeType] - MIME type forwarded to `bufferToDataUri`.
 * @returns {string} The data URI.
 */
function bufferToBase64(buffer, mimeType) {
  const dataUri = bufferToDataUri(buffer, mimeType);
  return dataUri;
}
|
|
1106
|
+
var FlowInputValidationError = class _FlowInputValidationError extends Error {
  /**
   * Error raised when a flow input has a missing, undetectable or
   * unaccepted document format.
   *
   * @param message - Human-readable error message
   * @param detectedType - The MIME type detected from the input
   * @param acceptedTypes - List of MIME types that would have been accepted
   */
  constructor(message, detectedType, acceptedTypes) {
    super(message);
    Object.assign(this, {
      detectedType,
      acceptedTypes,
      name: "FlowInputValidationError"
    });
    // Pin the prototype explicitly to this class.
    Object.setPrototypeOf(this, _FlowInputValidationError.prototype);
  }
};
|
|
1120
|
+
/**
 * Detects the document type of a flow input and checks it against the
 * accepted formats.
 *
 * @param {string} input - Data URI, URL, path or raw base64.
 * @param {string[]} acceptedFormats - Accepted MIME types; an empty list
 *   accepts every detectable type.
 * @returns {string} The detected MIME type.
 * @throws {FlowInputValidationError} When the input is empty, its type
 *   cannot be detected, or the type is not in `acceptedFormats`.
 */
function validateFlowInputFormat(input, acceptedFormats) {
  if (!input) {
    throw new FlowInputValidationError(
      "Flow input is empty or undefined",
      "undefined",
      acceptedFormats
    );
  }
  const detected = detectDocumentType(input);
  const hasRestrictions = acceptedFormats.length > 0;
  if (detected === "unknown") {
    const acceptedList = hasRestrictions
      ? `Expected one of: ${acceptedFormats.join(", ")}`
      : "Unable to determine document format";
    throw new FlowInputValidationError(
      `Unable to detect document format. ${acceptedList}. Ensure the input is a valid document (PDF, JPEG, PNG, GIF, or WebP).`,
      "unknown",
      acceptedFormats
    );
  }
  const isAccepted = !hasRestrictions || acceptedFormats.includes(detected);
  if (!isAccepted) {
    throw new FlowInputValidationError(
      `Document format '${detected}' is not accepted. Expected one of: ${acceptedFormats.join(", ")}`,
      detected,
      acceptedFormats
    );
  }
  return detected;
}
|
|
1146
|
+
|
|
1147
|
+
// src/pdf-utils.ts
|
|
1148
|
+
import { PDFDocument } from "pdf-lib";
|
|
1149
|
+
/**
 * Counts the pages of a PDF supplied as a base64 data URL.
 *
 * @param {string} dataUrl - `data:application/pdf;base64,<payload>`.
 * @returns {Promise<number>} Page count reported by pdf-lib.
 * @throws {Error} When `dataUrl` does not have the expected PDF data URL form.
 */
async function getPDFPageCount(dataUrl) {
  const parsed = dataUrl.match(/^data:application\/pdf;base64,(.+)$/);
  if (parsed === null) {
    throw new Error("Invalid PDF data URL format. Expected: data:application/pdf;base64,{base64data}");
  }
  const [, payload] = parsed;
  const sourceDoc = await PDFDocument.load(base64ToArrayBuffer(payload));
  return sourceDoc.getPageCount();
}
|
|
1159
|
+
/**
 * Splits a PDF data URL into several PDFs, one per requested page range.
 *
 * @param {string} dataUrl - `data:application/pdf;base64,<payload>`.
 * @param {Array<[number, number]>} pageRanges - Inclusive, 1-indexed
 *   `[startPage, endPage]` pairs.
 * @returns {Promise<string[]>} One PDF data URL per range, in input order.
 * @throws {Error} When `dataUrl` is malformed or a range is out of bounds
 *   or reversed.
 */
async function splitPDFIntoChunks(dataUrl, pageRanges) {
  const parsed = dataUrl.match(/^data:application\/pdf;base64,(.+)$/);
  if (parsed === null) {
    throw new Error("Invalid PDF data URL format. Expected: data:application/pdf;base64,{base64data}");
  }
  const [, payload] = parsed;
  const sourceDoc = await PDFDocument.load(base64ToArrayBuffer(payload));
  const totalPages = sourceDoc.getPageCount();
  const chunks = [];
  for (const [startPage, endPage] of pageRanges) {
    const rangeIsInvalid = startPage < 1 || endPage > totalPages || startPage > endPage;
    if (rangeIsInvalid) {
      throw new Error(
        `Invalid page range [${startPage}, ${endPage}] for PDF with ${totalPages} pages. Page numbers must be 1-indexed and within bounds.`
      );
    }
    const chunkDoc = await PDFDocument.create();
    // pdf-lib copies pages by 0-based index, so shift the 1-based range down.
    const pageSpan = endPage - startPage + 1;
    const zeroBasedIndices = Array.from(
      { length: pageSpan },
      (_unused, offset) => startPage - 1 + offset
    );
    const copiedPages = await chunkDoc.copyPages(sourceDoc, zeroBasedIndices);
    for (const page of copiedPages) {
      chunkDoc.addPage(page);
    }
    const chunkBytes = await chunkDoc.save();
    chunks.push(`data:application/pdf;base64,${uint8ArrayToBase64(chunkBytes)}`);
  }
  return chunks;
}
|
|
1189
|
+
/**
 * Returns the page count of a document IR.
 *
 * @param {{pages: Array, extras?: {pageCount?: number}}} ir - Document IR.
 * @returns {number} `ir.extras.pageCount` when it is defined, otherwise the
 *   number of entries in `ir.pages`.
 */
function getDocumentPageCount(ir) {
  const declaredCount = ir.extras?.pageCount;
  return declaredCount !== undefined ? declaredCount : ir.pages.length;
}
|
|
1195
|
+
/**
 * Sums `getDocumentPageCount` over a list of document IRs.
 *
 * @param {Array<object>} irArray - Document IRs.
 * @returns {number} Total page count; 0 for an empty list.
 */
function getTotalPageCount(irArray) {
  let total = 0;
  for (const ir of irArray) {
    total = total + getDocumentPageCount(ir);
  }
  return total;
}
|
|
1198
|
+
/**
 * Collects page-count and chunking information from a document IR.
 *
 * @param {{pages: Array, extras?: object}} ir - Document IR.
 * @returns {object} `pageCount` (from `extras.pageCount`, else the length of
 *   `pages`), `pagesInIR`, `isChunked` (both `chunkIndex` and `totalChunks`
 *   are defined), the raw `chunkIndex`, `totalChunks`, `pageRange` and
 *   `totalSemanticChunks` values, and `isSemanticChunking`
 *   (`totalSemanticChunks` is defined).
 */
function getPageCountMetadata(ir) {
  const pagesInIR = ir.pages.length;
  const extras = ir.extras;
  const chunkIndex = extras?.chunkIndex;
  const totalChunks = extras?.totalChunks;
  const totalSemanticChunks = extras?.totalSemanticChunks;
  return {
    pageCount: extras?.pageCount ?? pagesInIR,
    pagesInIR,
    isChunked: chunkIndex !== undefined && totalChunks !== undefined,
    chunkIndex,
    totalChunks,
    pageRange: extras?.pageRange,
    totalSemanticChunks,
    isSemanticChunking: totalSemanticChunks !== undefined
  };
}
|
|
1214
|
+
|
|
1215
|
+
// src/provider-config.ts
|
|
1216
|
+
/**
 * Returns a copy of a provider config with `type` defaulted to "vlm".
 * A `type` already present in `config` takes precedence.
 *
 * @param {object} config - Provider settings.
 * @returns {object} New config object.
 */
function defineVLMProvider(config) {
  const providerConfig = { type: "vlm", ...config };
  return providerConfig;
}
|
|
1222
|
+
/**
 * Returns a copy of a provider config with `type` defaulted to "ocr".
 * A `type` already present in `config` takes precedence.
 *
 * @param {object} config - Provider settings.
 * @returns {object} New config object.
 */
function defineSuryaProvider(config) {
  const providerConfig = { type: "ocr", ...config };
  return providerConfig;
}
|
|
1228
|
+
/**
 * Returns a copy of a provider config with `type` defaulted to "ocr".
 * A `type` already present in `config` takes precedence.
 *
 * @param {object} config - Provider settings.
 * @returns {object} New config object.
 */
function defineMarkerProvider(config) {
  const providerConfig = { type: "ocr", ...config };
  return providerConfig;
}
|
|
1234
|
+
/**
 * Instantiates a provider from a serialisable config plus its secret.
 *
 * The provider packages are loaded lazily with dynamic `import()`:
 * "@doclo/providers-llm" for `type: "vlm"` and "@doclo/providers-datalab"
 * for `type: "ocr"` (providers "surya" and "marker").
 *
 * @param {object} config - Provider config; `id` selects the secret and
 *   `type` selects the provider family.
 * @param {Record<string, {apiKey?: string}>} secrets - Secrets keyed by
 *   provider id.
 * @returns {Promise<object>} Whatever the package's factory returns.
 * @throws {Error} When the API key is missing, the type is unknown, or
 *   anything fails while loading the package or looking up its factory.
 */
async function buildProviderFromConfig(config, secrets) {
  const credentials = secrets[config.id];
  if (!credentials?.apiKey) {
    throw new Error(`API key not found for provider "${config.id}"`);
  }
  switch (config.type) {
    case "vlm": {
      try {
        const llmPackage = await import(
          /* webpackIgnore: true */
          "@doclo/providers-llm"
        );
        const makeVLM = llmPackage.createVLMProvider || llmPackage.default?.createVLMProvider;
        if (!makeVLM) {
          throw new Error("@doclo/providers-llm does not export createVLMProvider");
        }
        return makeVLM({
          provider: config.provider,
          model: config.model,
          apiKey: credentials.apiKey,
          via: config.via === "openrouter" ? "openrouter" : void 0,
          baseUrl: config.baseUrl
        });
      } catch (error) {
        throw new Error(
          `Failed to create VLM provider: ${error.message}. Make sure @doclo/providers-llm is installed.`
        );
      }
    }
    case "ocr": {
      try {
        const datalabPackage = await import(
          /* webpackIgnore: true */
          "@doclo/providers-datalab"
        );
        if (config.provider === "surya") {
          const makeSurya = datalabPackage.suryaProvider || datalabPackage.default?.suryaProvider;
          if (!makeSurya) {
            throw new Error("@doclo/providers-datalab does not export suryaProvider");
          }
          return makeSurya({
            endpoint: config.endpoint,
            apiKey: credentials.apiKey
          });
        }
        if (config.provider === "marker") {
          const makeMarker = datalabPackage.markerProvider || datalabPackage.default?.markerProvider;
          if (!makeMarker) {
            throw new Error("@doclo/providers-datalab does not export markerProvider");
          }
          return makeMarker({
            apiKey: credentials.apiKey,
            force_ocr: config.force_ocr,
            use_llm: config.use_llm
          });
        }
        throw new Error(`Unknown OCR provider: ${config.provider}`);
      } catch (error) {
        throw new Error(
          `Failed to create OCR provider: ${error.message}. Make sure @doclo/providers-datalab is installed.`
        );
      }
    }
    default:
      throw new Error(`Unknown provider type: ${config.type}`);
  }
}
|
|
1300
|
+
/**
 * Builds every provider in `configs`, one after another, and returns them
 * keyed by config id.
 *
 * @param {Array<object>} configs - Provider configs.
 * @param {Record<string, {apiKey?: string}>} secrets - Secrets keyed by
 *   provider id.
 * @returns {Promise<Record<string, object>>} Providers keyed by `config.id`.
 * @throws {Error} On the first provider that fails to build; the message
 *   names the provider id.
 */
async function buildProvidersFromConfigs(configs, secrets) {
  const providersById = {};
  for (const providerConfig of configs) {
    let provider;
    try {
      provider = await buildProviderFromConfig(providerConfig, secrets);
    } catch (error) {
      throw new Error(
        `Failed to build provider "${providerConfig.id}": ${error.message}`
      );
    }
    providersById[providerConfig.id] = provider;
  }
  return providersById;
}
|
|
1313
|
+
|
|
1314
|
+
// src/provider-identity.ts
|
|
1315
|
+
/**
 * Formats a provider identity as "provider:model".
 *
 * @param {{provider: string, model: string}} identity - Provider identity.
 * @returns {string} `<provider>:<model>`.
 */
function toProviderString(identity) {
  const { provider, model } = identity;
  return `${provider}:${model}`;
}
|
|
1318
|
+
/**
 * Splits a "provider:model" string at its first colon.
 *
 * @param {string} str - Provider string.
 * @returns {{provider: string, model: string}} The two parts; when there is
 *   no colon, both fields hold the whole input.
 */
function parseProviderString(str) {
  const separatorAt = str.indexOf(":");
  if (separatorAt < 0) {
    return { provider: str, model: str };
  }
  const provider = str.substring(0, separatorAt);
  const model = str.substring(separatorAt + 1);
  return { provider, model };
}
|
|
1328
|
+
/**
 * Reports whether an endpoint points at the local machine or a private
 * network.
 *
 * When `endpoint` parses as a URL with a host, the decision is made on the
 * host alone: "localhost" and names under ".localhost", the IPv6 loopback
 * "[::1]", 0.0.0.0, 127.0.0.0/8, 10.0.0.0/8, 172.16.0.0/12 and
 * 192.168.0.0/16, for any scheme. Matching on the host avoids both the false
 * positives of substring matching (e.g. "notlocalhost.example.com") and the
 * false negatives of the "http://"-only prefix checks (e.g.
 * "https://192.168.1.5").
 *
 * Inputs that are not absolute URLs with a host (such as "localhost:8080")
 * fall back to the original substring/prefix heuristic.
 *
 * @param {string} [endpoint] - Endpoint URL or host string.
 * @returns {boolean} `true` for local/private endpoints; `false` otherwise
 *   or when `endpoint` is empty.
 */
function isLocalEndpoint(endpoint) {
  if (!endpoint) return false;
  let host = "";
  try {
    host = new URL(endpoint).hostname.toLowerCase();
  } catch {
    // Not an absolute URL; handled by the fallback below.
  }
  if (host === "") {
    return endpoint.includes("localhost") || endpoint.includes("127.0.0.1") || endpoint.includes("0.0.0.0") || endpoint.startsWith("http://192.168.") || endpoint.startsWith("http://10.");
  }
  // A trailing dot only marks the name as fully qualified.
  host = host.replace(/\.$/, "");
  if (host === "localhost" || host.endsWith(".localhost") || host === "[::1]") {
    return true;
  }
  const ipv4 = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
  if (!ipv4) {
    return false;
  }
  const first = Number(ipv4[1]);
  const second = Number(ipv4[2]);
  return host === "0.0.0.0" || first === 127 || first === 10 || (first === 192 && second === 168) || (first === 172 && second >= 16 && second <= 31);
}
|
|
1332
|
+
/**
 * Builds a provider identity record and derives its access method.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Model name.
 * @param {{via?: string, endpoint?: string}} [opts] - Routing hints.
 * @returns {{provider: string, model: string, method: string}} `method` is
 *   "openrouter" when `opts.via` is "openrouter", else "self-hosted" when
 *   `isLocalEndpoint(opts.endpoint)` is true, else "native".
 */
function createIdentity(provider, model, opts) {
  const method = opts?.via === "openrouter"
    ? "openrouter"
    : isLocalEndpoint(opts?.endpoint)
      ? "self-hosted"
      : "native";
  return { provider, model, method };
}
|
|
1341
|
+
|
|
1342
|
+
// src/provider-query.ts
|
|
1343
|
+
// Module-level registry of provider metadata: maps a source key to a Map of
// provider id -> normalized metadata (filled by registerProviderMetadata).
var providerRegistry = /* @__PURE__ */ new Map();
|
|
1344
|
+
/**
 * Normalizes a source's provider metadata and stores it in the registry,
 * replacing any previous entry for that source.
 *
 * @param {string} source - Registry key for this batch of metadata.
 * @param {Record<string, object>} metadata - Raw metadata keyed by provider id.
 * @param {Function} [normalizer] - `(id, data, source) => normalized`;
 *   `defaultNormalizer` is used when omitted.
 * @returns {void}
 */
function registerProviderMetadata(source, metadata, normalizer) {
  const normalizedEntries = Object.entries(metadata).map(([id, data]) => [
    id,
    normalizer ? normalizer(id, data, source) : defaultNormalizer(id, data, source)
  ]);
  providerRegistry.set(source, new Map(normalizedEntries));
}
|
|
1355
|
+
/**
 * Returns the normalized metadata of every registered provider, across all
 * sources, as one flat array.
 *
 * @returns {Array<object>} A new array in registry insertion order.
 */
function getAllProviders() {
  return Array.from(providerRegistry.values()).flatMap(
    (providersOfSource) => Array.from(providersOfSource.values())
  );
}
|
|
1362
|
+
/**
 * Returns the registered providers that satisfy every criterion present on
 * `filter`. Criteria are applied one after another, each narrowing the list
 * produced by the previous one; an empty filter returns all providers.
 *
 * List-style criteria (`source`, `type`, `provider`, `model`, `method`,
 * `mimeType`, `inputRequirements.inputType`) accept a single value or an array.
 *
 * @param {Object} [filter={}] - Query criteria.
 * @returns {Array} Matching providers, in registry order.
 */
function queryProviders(filter = {}) {
  const toList = (value) => (Array.isArray(value) ? value : [value]);
  let matches = getAllProviders();

  if (filter.source) {
    const wantedSources = toList(filter.source);
    matches = matches.filter((p) => wantedSources.includes(p.source));
  }
  if (filter.type) {
    const wantedTypes = toList(filter.type);
    matches = matches.filter((p) => wantedTypes.includes(p.type));
  }

  // Identity criteria: a provider lacking the identity field never matches.
  for (const field of ["provider", "model", "method"]) {
    if (filter[field]) {
      const wanted = toList(filter[field]);
      matches = matches.filter(
        (p) => p.identity?.[field] && wanted.includes(p.identity[field])
      );
    }
  }

  if (filter.supports) {
    // [filter.supports key, provider capability flag]; order mirrors evaluation order.
    const supportFlags = [
      ["images", "supportsImages"],
      ["pdfs", "supportsPDFs"],
      ["documents", "supportsDocuments"],
      ["reasoning", "supportsReasoning"],
      ["structuredOutput", "supportsStructuredOutput"],
      ["prompts", "supportsPrompts"],
      ["citations", "supportsCitations"],
      ["chunking", "supportsChunking"],
      ["imageExtraction", "supportsImageExtraction"],
      ["pageMarkers", "supportsPageMarkers"],
      ["languageHints", "supportsLanguageHints"],
      ["processingModes", "supportsProcessingModes"],
      ["segmentation", "supportsSegmentation"]
    ];
    for (const [option, capability] of supportFlags) {
      // Only explicitly provided flags constrain the result; `false` is a real constraint.
      if (filter.supports[option] !== void 0) {
        matches = matches.filter(
          (p) => p.capabilities[capability] === filter.supports[option]
        );
      }
    }
  }

  if (filter.hasFeatures && filter.hasFeatures.length > 0) {
    matches = matches.filter((p) =>
      filter.hasFeatures.every((feature) => p.features[feature] === true)
    );
  }

  if (filter.outputFormat) {
    matches = matches.filter(
      (p) => p.capabilities.outputFormats[filter.outputFormat] === true
    );
  }

  if (filter.inputRequirements?.inputType !== void 0) {
    const wantedInputTypes = toList(filter.inputRequirements.inputType);
    matches = matches.filter((p) =>
      wantedInputTypes.includes(p.inputRequirements.inputType)
    );
  }

  if (filter.compatibleWith && filter.compatibleWith.length > 0) {
    matches = matches.filter((p) =>
      filter.compatibleWith.every((nodeName) => p.compatibleNodes[nodeName])
    );
  }

  if (filter.mimeType) {
    const requiredMimes = toList(filter.mimeType);
    matches = matches.filter((p) => {
      const accepted = [
        ...p.inputFormats.imageMimeTypes,
        ...p.inputFormats.documentMimeTypes
      ];
      return requiredMimes.every((mime) => accepted.includes(mime));
    });
  }

  // Largest size a provider declares; `unknown` stands in for a missing image/PDF limit.
  const declaredLimit = (formats, unknown) =>
    formats.maxFileSize ??
    Math.max(formats.maxImageSize ?? unknown, formats.maxPdfSize ?? unknown);

  if (filter.minFileSize !== void 0) {
    matches = matches.filter(
      (p) => declaredLimit(p.inputFormats, 0) >= filter.minFileSize
    );
  }
  if (filter.maxFileSize !== void 0) {
    matches = matches.filter(
      (p) => declaredLimit(p.inputFormats, Infinity) <= filter.maxFileSize
    );
  }

  // Providers without the relevant price are excluded by the cost criteria.
  if (filter.maxCostPerPage !== void 0) {
    matches = matches.filter(
      (p) => p.pricing.perPage !== void 0 && p.pricing.perPage <= filter.maxCostPerPage
    );
  }
  if (filter.maxCostPer1kTokens !== void 0) {
    matches = matches.filter(
      (p) =>
        p.pricing.inputPer1kTokens !== void 0 &&
        p.pricing.inputPer1kTokens <= filter.maxCostPer1kTokens
    );
  }

  // Caller-supplied predicate runs last.
  if (filter.filter) {
    matches = matches.filter(filter.filter);
  }
  return matches;
}
|
|
1478
|
+
/**
 * Looks up a provider entry by id across all sources.
 *
 * @param {string} id - Provider id to find.
 * @returns {*} The entry from the first source (in registration order) that
 *   contains `id`, or `undefined` when no source contains it.
 */
function getProviderById(id) {
  const owningSource = Array.from(providerRegistry.values()).find((bySource) =>
    bySource.has(id)
  );
  return owningSource ? owningSource.get(id) : void 0;
}
|
|
1486
|
+
/**
 * Lists the provider entries registered under one source.
 *
 * @param {string} source - Source key to look up.
 * @returns {Array} A new array of that source's entries; empty when the
 *   source has not been registered.
 */
function getProvidersBySource(source) {
  const bucket = providerRegistry.get(source);
  if (!bucket) {
    return [];
  }
  return Array.from(bucket.values());
}
|
|
1490
|
+
/**
 * Removes every source (and all of its provider entries) from the provider
 * registry. The registry Map itself is kept and emptied in place.
 *
 * @returns {void}
 */
function clearProviderRegistry() {
  providerRegistry.clear();
}
|
|
1493
|
+
/**
 * Normalizes raw provider metadata when no custom normalizer was supplied.
 *
 * The sources "llm", "datalab", "reducto" and "unsiloed" are delegated to
 * their dedicated normalizers. Any other source gets a conservative generic
 * entry: only a handful of fields are read from `data`, everything else is
 * reported as unsupported.
 *
 * @param {string} id - Provider id.
 * @param {Object} data - Raw provider metadata.
 * @param {string} source - Source the metadata was registered under.
 * @returns {Object} Normalized provider entry (`raw` holds `data` itself).
 */
function defaultNormalizer(id, data, source) {
  switch (source) {
    case "llm":
      return normalizeLLMProvider(id, data);
    case "datalab":
      return normalizeDatalabProvider(id, data);
    case "reducto":
      return normalizeReductoProvider(id, data);
    case "unsiloed":
      return normalizeUnsiloedProvider(id, data);
    default:
      break;
  }

  // One formats object, shared by `capabilities` and `features`.
  const outputFormats = { text: true, markdown: false, html: false, json: false };

  // Generic providers advertise no optional features.
  const featureNames = [
    "maxPages",
    "pageRange",
    "languageHints",
    "processingModes",
    "agenticMode",
    "customPrompts",
    "imageExtraction",
    "pageMarkers",
    "citations",
    "chunking",
    "segmentation",
    "stripExistingOCR",
    "formatLines",
    "forceOCR",
    "tableOutputFormats",
    "tableMerging",
    "confidence",
    "boundingBoxes",
    "schemaValidation",
    "handwrittenText"
  ];
  const features = {
    ...Object.fromEntries(featureNames.map((name) => [name, false])),
    outputFormats
  };

  const declaredCapability = (key) => data.capabilities?.[key] ?? false;
  const nodeNames = ["parse", "extract", "categorize", "qualify", "split"];

  return {
    id,
    name: data.name ?? id,
    source,
    type: data.type ?? "LLM",
    capabilities: {
      supportsImages: declaredCapability("supportsImages"),
      supportsPDFs: declaredCapability("supportsPDFs"),
      supportsDocuments: declaredCapability("supportsDocuments"),
      supportsReasoning: declaredCapability("supportsReasoning"),
      supportsStructuredOutput: declaredCapability("supportsStructuredOutput"),
      supportsPrompts: false,
      supportsCitations: false,
      supportsChunking: false,
      supportsImageExtraction: false,
      supportsPageMarkers: false,
      supportsLanguageHints: false,
      supportsProcessingModes: false,
      supportsSegmentation: false,
      outputFormats
    },
    features,
    inputRequirements: {
      inputType: data.inputRequirements?.inputType ?? "any",
      acceptedMethods:
        data.inputRequirements?.acceptedMethods ??
        data.inputFormats?.inputMethods ??
        ["base64"]
    },
    compatibleNodes: Object.fromEntries(
      nodeNames.map((node) => [node, data.compatibleNodes?.[node] ?? false])
    ),
    inputFormats: {
      imageMimeTypes: [],
      documentMimeTypes: [],
      inputMethods: ["base64"]
    },
    pricing: {
      model: "per-token",
      currency: "USD"
    },
    raw: data
  };
}
|
|
1573
|
+
/**
 * Converts raw LLM provider metadata into the normalized provider shape.
 *
 * LLM entries always report text/markdown/html output and custom prompts;
 * JSON output defaults to true unless `capabilities.supportsStructuredOutput`
 * says otherwise. Document-processing extras (citations, chunking, bounding
 * boxes, ...) are reported as unsupported.
 *
 * @param {string} id - Provider id; also the fallback name, vendor and model.
 * @param {Object} d - Raw LLM provider metadata.
 * @returns {Object} Normalized provider entry with `source: "llm"`.
 */
function normalizeLLMProvider(id, d) {
  const caps = d.capabilities;
  const imageInput = d.inputFormats?.images;
  const pdfInput = d.inputFormats?.pdfs;
  const nodes = d.compatibleNodes;
  const nodeNames = ["parse", "extract", "categorize", "qualify", "split"];

  const outputFormats = {
    text: true,
    markdown: true,
    html: true,
    json: caps?.supportsStructuredOutput ?? true
  };

  // Start from "nothing supported" and switch on what LLMs provide.
  const featureNames = [
    "maxPages",
    "pageRange",
    "languageHints",
    "processingModes",
    "agenticMode",
    "customPrompts",
    "imageExtraction",
    "pageMarkers",
    "citations",
    "chunking",
    "segmentation",
    "stripExistingOCR",
    "formatLines",
    "forceOCR",
    "tableOutputFormats",
    "tableMerging",
    "confidence",
    "boundingBoxes",
    "schemaValidation",
    "handwrittenText"
  ];
  const features = {
    ...Object.fromEntries(featureNames.map((name) => [name, false])),
    // A page cap counts as supported only when the metadata declares one for PDFs.
    maxPages: pdfInput?.maxPages !== void 0,
    // LLMs can handle page ranges.
    pageRange: true,
    // All LLMs support prompts.
    customPrompts: true,
    // Schema validation follows structured-output support.
    schemaValidation: caps?.supportsStructuredOutput ?? false,
    outputFormats
  };

  return {
    id,
    name: d.name ?? id,
    source: "llm",
    type: "LLM",
    // 3-layer identity: vendor / model / access method.
    identity: {
      provider: d.vendor ?? id,
      model: d.defaultModel ?? id,
      method: "native"
    },
    capabilities: {
      supportsImages: caps?.supportsImages ?? false,
      supportsPDFs: caps?.supportsPDFs ?? false,
      // LLM providers don't support Office docs directly.
      supportsDocuments: false,
      supportsReasoning: caps?.supportsReasoning ?? false,
      supportsStructuredOutput: caps?.supportsStructuredOutput ?? false,
      supportsPrompts: true,
      supportsCitations: false,
      supportsChunking: false,
      supportsImageExtraction: false,
      supportsPageMarkers: false,
      supportsLanguageHints: false,
      supportsProcessingModes: false,
      supportsSegmentation: false,
      outputFormats
    },
    features,
    // LLM providers with vision can work with either raw documents or parsed text.
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "any",
      acceptedMethods:
        d.inputRequirements?.acceptedMethods ?? imageInput?.methods ?? ["base64", "url"]
    },
    compatibleNodes: Object.fromEntries(
      nodeNames.map((node) => [node, nodes?.[node] ?? false])
    ),
    inputFormats: {
      imageMimeTypes: imageInput?.mimeTypes ?? [],
      // PDFs only for LLM.
      documentMimeTypes: ["application/pdf"],
      inputMethods: imageInput?.methods ?? ["base64"],
      maxImageSize: imageInput?.maxSize,
      maxPdfSize: pdfInput?.maxSize,
      maxPages: pdfInput?.maxPages
    },
    pricing: {
      model: "per-token",
      inputPer1kTokens: d.pricing?.inputPer1k,
      outputPer1kTokens: d.pricing?.outputPer1k,
      currency: "USD",
      notes: d.pricing?.notes
    },
    rateLimits: {
      requestsPerMinute: d.limits?.requestsPerMinute
    },
    raw: d
  };
}
|
|
1682
|
+
/**
 * Converts raw Datalab provider metadata into the normalized provider shape.
 *
 * Most optional features mirror flags in `d.supportedOptions`; entries whose
 * `type` is "VLM" additionally report structured (JSON) output and schema
 * validation.
 *
 * @param {string} id - Provider id; also the fallback name and model.
 * @param {Object} d - Raw Datalab provider metadata.
 * @returns {Object} Normalized provider entry with `source: "datalab"`.
 */
function normalizeDatalabProvider(id, d) {
  const options = d.supportedOptions ?? {};
  // Reads a supported-option flag; a missing flag counts as unsupported.
  const option = (key) => options[key] ?? false;
  const structured = d.type === "VLM";
  const formatFeatures = d.outputFormat?.features;
  const caps = d.capabilities;
  const nodes = d.compatibleNodes;
  const nodeNames = ["parse", "extract", "categorize", "qualify", "split"];
  const declaredMimeTypes = d.inputFormats?.mimeTypes ?? [];
  const isImageMime = (mime) => mime.startsWith("image/");

  const outputFormats = {
    text: true,
    markdown: formatFeatures?.markdown ?? false,
    html: false,
    json: formatFeatures?.structuredJSON ?? structured
  };

  const features = {
    maxPages: option("maxPages"),
    pageRange: option("pageRange"),
    // Language hints come from the 'langs' option.
    languageHints: option("langs"),
    processingModes: option("mode"),
    // Datalab doesn't have agentic mode.
    agenticMode: false,
    customPrompts: option("blockCorrectionPrompt"),
    imageExtraction: option("extractImages"),
    // Page markers come from the 'paginate' option.
    pageMarkers: option("paginate"),
    citations: option("citations"),
    // Datalab doesn't have chunking.
    chunking: false,
    segmentation: option("segmentation"),
    stripExistingOCR: option("stripExistingOCR"),
    formatLines: option("formatLines"),
    // Datalab supports force_ocr.
    forceOCR: true,
    tableOutputFormats: false,
    tableMerging: false,
    // Datalab doesn't provide confidence scores.
    confidence: false,
    // Bounding boxes are assumed available unless the metadata says otherwise.
    boundingBoxes: formatFeatures?.boundingBoxes ?? true,
    // VLM providers support schema validation.
    schemaValidation: structured,
    // Datalab handles handwritten text.
    handwrittenText: true,
    outputFormats
  };

  return {
    id,
    name: d.name ?? id,
    source: "datalab",
    type: d.type ?? "OCR",
    // 3-layer identity; method defaults to native, can be overridden when self-hosted.
    identity: {
      provider: "datalab",
      model: d.model ?? id,
      method: "native"
    },
    capabilities: {
      supportsImages: caps?.supportsImages ?? true,
      supportsPDFs: caps?.supportsPDFs ?? true,
      supportsDocuments: caps?.supportsDocuments ?? true,
      // Datalab doesn't do reasoning.
      supportsReasoning: false,
      supportsStructuredOutput: structured,
      supportsPrompts: option("blockCorrectionPrompt"),
      supportsCitations: option("citations"),
      supportsChunking: false,
      supportsImageExtraction: option("extractImages"),
      supportsPageMarkers: option("paginate"),
      supportsLanguageHints: option("langs"),
      supportsProcessingModes: option("mode"),
      supportsSegmentation: option("segmentation"),
      outputFormats
    },
    features,
    // Datalab providers need raw document input unless the metadata overrides it.
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "raw-document",
      acceptedMethods:
        d.inputRequirements?.acceptedMethods ??
        d.inputFormats?.inputMethods ??
        ["base64", "url"]
    },
    compatibleNodes: Object.fromEntries(
      nodeNames.map((node) => [node, nodes?.[node] ?? false])
    ),
    inputFormats: {
      imageMimeTypes: declaredMimeTypes.filter((mime) => isImageMime(mime)),
      documentMimeTypes: declaredMimeTypes.filter((mime) => !isImageMime(mime)),
      inputMethods: d.inputFormats?.inputMethods ?? ["base64"],
      maxFileSize: d.inputFormats?.maxFileSize,
      maxPages: d.inputFormats?.maxPages
    },
    pricing: {
      model: "per-page",
      perPage: d.pricing?.perPage,
      currency: "USD",
      notes: d.pricing?.notes
    },
    rateLimits: {
      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
    },
    raw: d
  };
}
|
|
1786
|
+
/**
 * Converts raw Reducto provider metadata into the normalized provider shape.
 *
 * Optional features mirror `d.supportedOptions`, with several defaults keyed
 * off node compatibility: parse-compatible entries default to markdown,
 * table merging and bounding boxes; extract-compatible entries default to
 * JSON output and schema validation.
 *
 * @param {string} id - Provider id; also the fallback name.
 * @param {Object} d - Raw Reducto provider metadata.
 * @returns {Object} Normalized provider entry with `source: "reducto"`.
 */
function normalizeReductoProvider(id, d) {
  const options = d.supportedOptions ?? {};
  // Reads a supported-option flag; a missing flag counts as unsupported.
  const option = (key) => options[key] ?? false;
  const formatFeatures = d.outputFormat?.features;
  const caps = d.capabilities;
  const nodes = d.compatibleNodes;
  const nodeNames = ["parse", "extract", "categorize", "qualify", "split"];
  const structuredByType = d.type === "VLM";
  const extractsData = nodes?.extract === true;
  const declaredMimeTypes = d.inputFormats?.mimeTypes ?? [];
  const isImageMime = (mime) => mime.startsWith("image/");

  // Credit-based price (credits per page x USD per credit) wins over a flat per-page price.
  const pricing = d.pricing;
  let perPage;
  if (pricing?.standard) {
    perPage = pricing.standard * (pricing.usdPerCredit ?? 4e-3);
  } else {
    perPage = pricing?.perPage;
  }

  const outputFormats = {
    text: formatFeatures?.textLines ?? true,
    markdown: formatFeatures?.markdown ?? nodes?.parse ?? false,
    // Reducto can output HTML tables.
    html: option("tableOutputFormat"),
    json: formatFeatures?.structuredJSON ?? extractsData
  };

  const features = {
    maxPages: option("maxPages"),
    pageRange: option("pageRange"),
    languageHints: option("langs"),
    // Reducto uses agentic mode instead of processing modes.
    processingModes: false,
    // Agentic mode comes from the 'mode' option.
    agenticMode: option("mode"),
    // Custom prompts come from the 'additionalPrompt' option.
    customPrompts: option("additionalPrompt"),
    imageExtraction: option("extractImages"),
    // Reducto has addPageMarkers.
    pageMarkers: true,
    citations: option("citations"),
    chunking: option("chunking"),
    // Segmentation is offered via the Split endpoint.
    segmentation: option("segmentation"),
    stripExistingOCR: false,
    formatLines: false,
    forceOCR: false,
    tableOutputFormats: option("tableOutputFormat"),
    // Parse has mergeTables.
    tableMerging: nodes?.parse ?? false,
    confidence: options.confidence ?? formatFeatures?.confidence ?? false,
    boundingBoxes: formatFeatures?.boundingBoxes ?? nodes?.parse ?? false,
    // Extract has schema validation.
    schemaValidation: formatFeatures?.schemaValidation ?? extractsData,
    // Reducto doesn't specifically advertise handwriting.
    handwrittenText: false,
    outputFormats
  };

  return {
    id,
    name: d.name ?? id,
    source: "reducto",
    type: d.type ?? "OCR",
    // 3-layer identity: vendor / model / access method.
    identity: {
      provider: "reducto",
      model: d.model ?? "v1",
      method: "native"
    },
    capabilities: {
      supportsImages: caps?.supportsImages ?? true,
      supportsPDFs: caps?.supportsPDFs ?? true,
      supportsDocuments: caps?.supportsDocuments ?? true,
      // Reducto doesn't do reasoning.
      supportsReasoning: false,
      supportsStructuredOutput: structuredByType || extractsData,
      supportsPrompts: option("additionalPrompt"),
      supportsCitations: option("citations"),
      supportsChunking: option("chunking"),
      supportsImageExtraction: option("extractImages"),
      supportsPageMarkers: true,
      supportsLanguageHints: false,
      // Reported from the 'mode' (agentic) option.
      supportsProcessingModes: option("mode"),
      supportsSegmentation: option("segmentation"),
      outputFormats
    },
    features,
    // Reducto providers need raw document input unless the metadata overrides it.
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "raw-document",
      acceptedMethods:
        d.inputRequirements?.acceptedMethods ??
        d.inputFormats?.inputMethods ??
        ["base64", "url"]
    },
    compatibleNodes: Object.fromEntries(
      nodeNames.map((node) => [node, nodes?.[node] ?? false])
    ),
    inputFormats: {
      imageMimeTypes: declaredMimeTypes.filter((mime) => isImageMime(mime)),
      documentMimeTypes: declaredMimeTypes.filter((mime) => !isImageMime(mime)),
      inputMethods: d.inputFormats?.inputMethods ?? ["base64"],
      maxFileSize: d.inputFormats?.maxFileSize,
      maxPages: d.inputFormats?.maxPages
    },
    pricing: {
      model: "per-page",
      perPage,
      currency: "USD",
      notes: pricing?.notes
    },
    rateLimits: {
      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
    },
    raw: d
  };
}
|
|
1895
|
+
/**
 * Converts raw Unsiloed provider metadata into the normalized provider shape.
 *
 * Defaults are keyed off node compatibility: parse-compatible entries default
 * to text/markdown output and chunking, extract-compatible entries to JSON
 * output, citations and schema validation, split-compatible entries to
 * segmentation.
 *
 * @param {string} id - Provider id; also the fallback name.
 * @param {Object} d - Raw Unsiloed provider metadata.
 * @returns {Object} Normalized provider entry with `source: "unsiloed"`.
 */
function normalizeUnsiloedProvider(id, d) {
  const nodes = d.compatibleNodes;
  const nodeNames = ["parse", "extract", "categorize", "qualify", "split"];
  const parses = nodes?.parse === true;
  const extractsData = nodes?.extract === true;
  const splits = nodes?.split === true;
  const categorizes = nodes?.categorize === true;
  const structured = d.type === "VLM" || extractsData;
  const formatFeatures = d.outputFormat?.features;
  const caps = d.capabilities;
  const declaredMimeTypes = d.inputFormats?.mimeTypes ?? [];
  const isImageMime = (mime) => mime.startsWith("image/");

  // Processing modes are reported when the "YOLO segmentation" special feature is listed.
  const hasProcessingModes =
    caps?.specialFeatures?.includes("YOLO segmentation") ?? false;
  const citations = formatFeatures?.citations ?? extractsData;
  const chunking = formatFeatures?.semanticChunking ?? parses;

  const outputFormats = {
    text: formatFeatures?.textLines ?? parses,
    markdown: formatFeatures?.markdown ?? parses,
    // Unsiloed doesn't output HTML.
    html: false,
    json: formatFeatures?.structuredJSON ?? structured
  };

  // Start from "nothing supported" and switch on what the metadata indicates.
  const featureNames = [
    "maxPages",
    "pageRange",
    "languageHints",
    "processingModes",
    "agenticMode",
    "customPrompts",
    "imageExtraction",
    "pageMarkers",
    "citations",
    "chunking",
    "segmentation",
    "stripExistingOCR",
    "formatLines",
    "forceOCR",
    "tableOutputFormats",
    "tableMerging",
    "confidence",
    "boundingBoxes",
    "schemaValidation",
    "handwrittenText"
  ];
  const features = {
    ...Object.fromEntries(featureNames.map((name) => [name, false])),
    processingModes: hasProcessingModes,
    // Extract has citations.
    citations,
    // Parse has semantic chunking.
    chunking,
    // Split provider does segmentation.
    segmentation: splits,
    confidence: formatFeatures?.confidence ?? false,
    boundingBoxes: formatFeatures?.boundingBoxes ?? false,
    // Extract supports schema validation.
    schemaValidation: extractsData,
    outputFormats
  };

  return {
    id,
    name: d.name ?? id,
    source: "unsiloed",
    type: d.type ?? "OCR",
    // 3-layer identity: vendor / model / access method.
    identity: {
      provider: "unsiloed",
      model: d.model ?? "v1",
      method: "native"
    },
    capabilities: {
      supportsImages: caps?.supportsImages ?? true,
      supportsPDFs: caps?.supportsPDFs ?? true,
      supportsDocuments: caps?.supportsDocuments ?? false,
      // Unsiloed doesn't do reasoning.
      supportsReasoning: false,
      supportsStructuredOutput: structured,
      // Unsiloed doesn't support custom prompts.
      supportsPrompts: false,
      supportsCitations: citations,
      supportsChunking: chunking,
      supportsImageExtraction: false,
      supportsPageMarkers: false,
      supportsLanguageHints: false,
      supportsProcessingModes: hasProcessingModes,
      supportsSegmentation: splits || categorizes,
      outputFormats
    },
    features,
    // Unsiloed providers need raw document input unless the metadata overrides it.
    inputRequirements: {
      inputType: d.inputRequirements?.inputType ?? "raw-document",
      acceptedMethods:
        d.inputRequirements?.acceptedMethods ??
        d.inputFormats?.inputMethods ??
        ["base64", "url"]
    },
    compatibleNodes: Object.fromEntries(
      nodeNames.map((node) => [node, nodes?.[node] ?? false])
    ),
    inputFormats: {
      imageMimeTypes: declaredMimeTypes.filter((mime) => isImageMime(mime)),
      documentMimeTypes: declaredMimeTypes.filter((mime) => !isImageMime(mime)),
      inputMethods: d.inputFormats?.inputMethods ?? ["base64"],
      maxFileSize: d.inputFormats?.maxFileSize,
      maxPages: d.inputFormats?.maxPages
    },
    pricing: {
      model: "per-page",
      perPage: d.pricing?.standardUSD ?? d.pricing?.perPage,
      currency: "USD",
      notes: d.pricing?.notes
    },
    rateLimits: {
      docsPerMinute: d.apiConfig?.rateLimit?.docsPerMinute
    },
    raw: d
  };
}
|
|
2008
|
+
/**
 * Convenience wrapper around `queryProviders` that filters by MIME type only.
 *
 * @param {string|string[]} mimeType - MIME type (or list of types) passed
 *   through as the query's `mimeType` criterion.
 * @returns {Array} Whatever `queryProviders` returns for that criterion.
 */
function getProvidersForMimeType(mimeType) {
  const criteria = { mimeType };
  return queryProviders(criteria);
}
|
|
2011
|
+
/**
 * Finds the lowest-cost provider for a capability.
 *
 * "ocr" and "parse" select parse-compatible providers; "extraction" selects
 * extract-compatible providers. A provider's cost is its per-page price,
 * falling back to its input price per 1k tokens, and is treated as infinite
 * when neither is declared. The two price kinds are compared as plain numbers.
 *
 * @param {string} capability - One of "ocr", "parse" or "extraction".
 * @returns {Object|undefined} The cheapest matching provider (the earliest one
 *   on ties), or `undefined` when nothing matches or the capability is not
 *   recognised.
 */
function getCheapestProviderFor(capability) {
  const nodeByCapability = new Map([
    ["ocr", "parse"],
    ["parse", "parse"],
    ["extraction", "extract"]
  ]);
  const node = nodeByCapability.get(capability);
  // An unrecognised capability used to crash on `undefined.sort(...)`; report "no provider" instead.
  if (node === void 0) {
    return void 0;
  }

  const candidates = queryProviders({ compatibleWith: [node] });
  const costOf = (provider) =>
    provider.pricing.perPage ?? provider.pricing.inputPer1kTokens ?? Infinity;

  // Linear scan instead of a full sort; strict `<` keeps the earliest provider on ties.
  let cheapest;
  let cheapestCost = Infinity;
  candidates.forEach((candidate, index) => {
    const cost = costOf(candidate);
    if (index === 0 || cost < cheapestCost) {
      cheapest = candidate;
      cheapestCost = cost;
    }
  });
  return cheapest;
}
|
|
2028
|
+
/**
 * Convenience wrapper around `queryProviders` that keeps providers whose
 * declared size limit is at least `minSizeMB`.
 *
 * @param {number} [minSizeMB=100] - Passed through unchanged as the query's
 *   `minFileSize` criterion. NOTE(review): presumably provider size limits are
 *   stored in megabytes as well; confirm against the provider metadata.
 * @returns {Array} Whatever `queryProviders` returns for that criterion.
 */
function getProvidersForLargeFiles(minSizeMB = 100) {
  const criteria = { minFileSize: minSizeMB };
  return queryProviders(criteria);
}
|
|
2031
|
+
// Registry of provider metadata that carries per-model details: provider id -> metadata.
var modelRegistry = /* @__PURE__ */ new Map();
|
|
2032
|
+
/**
 * Stores provider metadata in the model registry under `providerId`,
 * replacing any metadata previously stored for that id.
 *
 * @param {string} providerId - Key to store the metadata under.
 * @param {Object} metadata - Provider metadata, stored as-is (not copied).
 * @returns {void}
 */
function registerProviderWithModels(providerId, metadata) {
  modelRegistry.set(providerId, metadata);
}
|
|
2035
|
+
/**
 * Resolves the effective metadata for a provider/model pair.
 *
 * Providers present in the model registry are resolved through
 * `resolveFromProviderWithModels`. Otherwise the provider is looked up in the
 * provider registry and its own metadata is returned with shallow copies of
 * its capability, feature, input-requirement, node and pricing objects.
 *
 * @param {string} providerId - Provider id to resolve.
 * @param {string} [modelId] - Model id; when nullish, the provider id is
 *   reported as the model id and the provider name as the model name.
 * @returns {Object|undefined} Resolved metadata, or `undefined` when the
 *   provider is unknown to both registries.
 */
function resolveModelMetadata(providerId, modelId) {
  const withModels = modelRegistry.get(providerId);
  if (withModels) {
    return resolveFromProviderWithModels(withModels, modelId);
  }

  const flatProvider = getProviderById(providerId);
  if (!flatProvider) {
    return void 0;
  }

  const { id, name, source } = flatProvider;
  // Shallow copy so callers cannot mutate the registered section objects.
  const copyOf = (section) => ({ ...flatProvider[section] });
  return {
    modelId: modelId ?? providerId,
    modelName: modelId ?? name,
    providerId: id,
    providerName: name,
    providerSource: source,
    capabilities: copyOf("capabilities"),
    features: copyOf("features"),
    inputRequirements: copyOf("inputRequirements"),
    compatibleNodes: copyOf("compatibleNodes"),
    pricing: copyOf("pricing")
  };
}
|
|
2055
|
+
/**
 * Build the resolved metadata record for a provider that carries a model list.
 *
 * When `modelId` matches an entry in `provider.models`, every value that entry
 * declares wins; any value the model leaves null/undefined (or every value,
 * when no entry matches) falls back to the provider-level value.
 *
 * @param {object} provider - Provider record. Read here: `id`, `name`,
 *   `source`, `models`, `capabilities`, `inputRequirements`,
 *   `compatibleNodes`, `features` and `pricing`.
 * @param {string} [modelId] - Id of the model to look up in `provider.models`.
 * @returns {object} Freshly built metadata object. `features` is a shallow
 *   copy of the provider's features; `limits` is taken from the model only and
 *   is `undefined` when no model entry matched.
 */
function resolveFromProviderWithModels(provider, modelId) {
  const model = modelId ? provider.models?.find((candidate) => candidate.id === modelId) : void 0;

  // Model value first; the provider section is only touched when the model
  // does not supply a non-nullish value for that key.
  const inherit = (section, key) => model?.[section]?.[key] ?? provider[section][key];
  const mergeSection = (section, keys) => Object.fromEntries(keys.map((key) => [key, inherit(section, key)]));

  const capabilityKeys = [
    "supportsImages",
    "supportsPDFs",
    "supportsDocuments",
    "supportsReasoning",
    "supportsStructuredOutput",
    // NEW capabilities
    "supportsPrompts",
    "supportsCitations",
    "supportsChunking",
    "supportsImageExtraction",
    "supportsPageMarkers",
    "supportsLanguageHints",
    "supportsProcessingModes",
    "supportsSegmentation",
    "outputFormats"
  ];
  const inputRequirementKeys = ["inputType", "acceptedMethods"];
  const nodeKeys = ["parse", "extract", "categorize", "qualify", "split"];
  const overridablePricingKeys = ["inputPer1kTokens", "outputPer1kTokens", "perPage"];

  return {
    modelId: model?.id ?? modelId ?? provider.id,
    modelName: model?.name ?? model?.id ?? modelId ?? provider.name,
    openRouterId: model?.openRouterId,
    providerId: provider.id,
    providerName: provider.name,
    providerSource: provider.source,
    // Merge capabilities (model overrides provider)
    capabilities: mergeSection("capabilities", capabilityKeys),
    // Merge input requirements
    inputRequirements: mergeSection("inputRequirements", inputRequirementKeys),
    // Merge node compatibility
    compatibleNodes: mergeSection("compatibleNodes", nodeKeys),
    // Features (inherited from provider - models don't override features)
    features: { ...provider.features },
    // Merge pricing: only the rate fields can be overridden per model; the
    // pricing model, currency and notes always come from the provider.
    pricing: {
      model: provider.pricing.model,
      ...mergeSection("pricing", overridablePricingKeys),
      currency: provider.pricing.currency,
      notes: provider.pricing.notes
    },
    // Model limits
    limits: model?.limits
  };
}
|
|
2110
|
+
/**
 * Collect resolved model metadata for every model that satisfies `filter`.
 *
 * Two sources are scanned in order:
 *  1. Providers in `modelRegistry`: one candidate per entry of
 *     `provider.models`, or a single candidate using the provider's own id
 *     when `models` is null/undefined.
 *  2. Providers returned by `getAllProviders()` that are not in
 *     `modelRegistry`, resolved through `resolveModelMetadata`.
 *
 * `filter.providerId` and `filter.source` (each a single value or an array)
 * are applied at provider level; everything else is delegated to
 * `matchesModelFilter`.
 *
 * @param {object} [filter={}] - Query constraints; an empty filter matches all.
 * @returns {object[]} Matching resolved model records, in scan order.
 */
function queryModels(filter = {}) {
  const asList = (value) => (Array.isArray(value) ? value : [value]);

  // True when the provider-level constraints rule this provider out. The
  // provider id check runs before the source check.
  const isExcluded = (id, candidateProvider) => {
    if (filter.providerId && !asList(filter.providerId).includes(id)) {
      return true;
    }
    if (filter.source && !asList(filter.source).includes(candidateProvider.source)) {
      return true;
    }
    return false;
  };

  const matches = [];

  for (const [registeredId, registered] of modelRegistry) {
    if (isExcluded(registeredId, registered)) continue;
    // A provider without a model list is represented by one pseudo-model
    // carrying the provider's own id.
    const entries = registered.models ?? [{ id: registered.id }];
    for (const entry of entries) {
      const candidate = resolveFromProviderWithModels(registered, entry.id);
      if (matchesModelFilter(candidate, filter)) {
        matches.push(candidate);
      }
    }
  }

  for (const plainProvider of getAllProviders()) {
    // Already handled by the loop above.
    if (modelRegistry.has(plainProvider.id)) continue;
    if (isExcluded(plainProvider.id, plainProvider)) continue;
    const candidate = resolveModelMetadata(plainProvider.id);
    if (candidate && matchesModelFilter(candidate, filter)) {
      matches.push(candidate);
    }
  }

  return matches;
}
|
|
2146
|
+
/**
 * Decide whether a resolved model record satisfies a query filter.
 *
 * Every constraint present on `filter` must hold; constraints that are absent
 * are ignored.
 *
 * @param {object} model - Resolved model record. Read here: `capabilities`,
 *   `features`, `inputRequirements`, `compatibleNodes` and `limits`.
 * @param {object} filter - Query constraints:
 *   - `supports`: map of capability flags that must equal the model's value
 *     exactly (keys left `undefined` are ignored).
 *   - `hasFeatures`: feature names that must all be strictly `true`.
 *   - `outputFormat`: key that must be strictly `true` in
 *     `capabilities.outputFormats`.
 *   - `inputRequirements.inputType`: one accepted value or an array of them.
 *   - `compatibleWith`: node types that must all be truthy in `compatibleNodes`.
 *   - `minContextTokens`: lower bound for `limits.maxContextTokens`
 *     (a model without that limit counts as 0).
 *   - `filter`: custom predicate called with the model.
 * @returns {boolean} `true` when the model passes every supplied constraint.
 */
function matchesModelFilter(model, filter) {
  if (filter.supports) {
    // filter.supports key -> model.capabilities key
    const capabilityBySupportsKey = [
      ["images", "supportsImages"],
      ["pdfs", "supportsPDFs"],
      ["documents", "supportsDocuments"],
      ["reasoning", "supportsReasoning"],
      ["structuredOutput", "supportsStructuredOutput"],
      ["prompts", "supportsPrompts"],
      ["citations", "supportsCitations"],
      ["chunking", "supportsChunking"],
      ["imageExtraction", "supportsImageExtraction"],
      ["pageMarkers", "supportsPageMarkers"],
      ["languageHints", "supportsLanguageHints"],
      ["processingModes", "supportsProcessingModes"],
      ["segmentation", "supportsSegmentation"]
    ];
    for (const [supportsKey, capabilityKey] of capabilityBySupportsKey) {
      const wanted = filter.supports[supportsKey];
      if (wanted !== void 0 && model.capabilities[capabilityKey] !== wanted) {
        return false;
      }
    }
  }
  if (filter.hasFeatures?.some((feature) => model.features[feature] !== true)) {
    return false;
  }
  if (filter.outputFormat) {
    // A model that declares no output formats cannot satisfy an output-format
    // constraint; treat it as a non-match instead of throwing on `undefined`.
    if (model.capabilities.outputFormats?.[filter.outputFormat] !== true) {
      return false;
    }
  }
  if (filter.inputRequirements?.inputType !== void 0) {
    const requested = filter.inputRequirements.inputType;
    const acceptedTypes = Array.isArray(requested) ? requested : [requested];
    if (!acceptedTypes.includes(model.inputRequirements.inputType)) {
      return false;
    }
  }
  if (filter.compatibleWith?.some((nodeType) => !model.compatibleNodes[nodeType])) {
    return false;
  }
  if (filter.minContextTokens !== void 0) {
    const contextTokens = model.limits?.maxContextTokens ?? 0;
    if (contextTokens < filter.minContextTokens) {
      return false;
    }
  }
  if (filter.filter && !filter.filter(model)) {
    return false;
  }
  return true;
}
|
|
2224
|
+
/**
 * List every model whose `compatibleNodes` entry for `nodeType` is truthy.
 *
 * Convenience wrapper around `queryModels` with a single-element
 * `compatibleWith` constraint.
 *
 * @param {string} nodeType - Node type key to require.
 * @returns {object[]} Matching resolved model records.
 */
function getModelsForNode(nodeType) {
  const nodeConstraint = { compatibleWith: [nodeType] };
  return queryModels(nodeConstraint);
}
|
|
2227
|
+
/**
 * List every resolvable model, with no constraints applied.
 *
 * @returns {object[]} All resolved model records produced by `queryModels`.
 */
function getAllModels() {
  const noConstraints = {};
  return queryModels(noConstraints);
}
|
|
2230
|
+
/**
 * Remove every entry from the module-level `modelRegistry`.
 *
 * @returns {void}
 */
function clearModelRegistry() {
  modelRegistry.clear();
}
|
|
2233
|
+
export {
|
|
2234
|
+
FlowExecutionError,
|
|
2235
|
+
FlowInputValidationError,
|
|
2236
|
+
FlowValidationError,
|
|
2237
|
+
NODE_COMPATIBILITY_MATRIX,
|
|
2238
|
+
RESERVED_VARIABLES,
|
|
2239
|
+
aggregateMetrics,
|
|
2240
|
+
bufferToBase64,
|
|
2241
|
+
bufferToDataUri,
|
|
2242
|
+
buildProviderFromConfig,
|
|
2243
|
+
buildProvidersFromConfigs,
|
|
2244
|
+
canStartForEachItemFlow,
|
|
2245
|
+
clearModelRegistry,
|
|
2246
|
+
clearProviderRegistry,
|
|
2247
|
+
createIdentity,
|
|
2248
|
+
defineMarkerProvider,
|
|
2249
|
+
defineSuryaProvider,
|
|
2250
|
+
defineVLMProvider,
|
|
2251
|
+
detectDocumentType,
|
|
2252
|
+
detectMimeTypeFromBase64,
|
|
2253
|
+
detectMimeTypeFromBase64Async,
|
|
2254
|
+
detectMimeTypeFromBytes,
|
|
2255
|
+
extractBase64,
|
|
2256
|
+
getAllModels,
|
|
2257
|
+
getAllProviders,
|
|
2258
|
+
getCheapestProviderFor,
|
|
2259
|
+
getCompatibleTargets,
|
|
2260
|
+
getDocumentPageCount,
|
|
2261
|
+
getModelsForNode,
|
|
2262
|
+
getNodeTypeInfo,
|
|
2263
|
+
getNodeTypeName,
|
|
2264
|
+
getPDFPageCount,
|
|
2265
|
+
getPageCountMetadata,
|
|
2266
|
+
getProviderById,
|
|
2267
|
+
getProvidersBySource,
|
|
2268
|
+
getProvidersForLargeFiles,
|
|
2269
|
+
getProvidersForMimeType,
|
|
2270
|
+
getSuggestedConnections,
|
|
2271
|
+
getTotalPageCount,
|
|
2272
|
+
getValidForEachStarters,
|
|
2273
|
+
isLocalEndpoint,
|
|
2274
|
+
isPDFDocument,
|
|
2275
|
+
node,
|
|
2276
|
+
parseProviderString,
|
|
2277
|
+
protectReservedVariables,
|
|
2278
|
+
queryModels,
|
|
2279
|
+
queryProviders,
|
|
2280
|
+
registerProviderMetadata,
|
|
2281
|
+
registerProviderWithModels,
|
|
2282
|
+
resolveDocument,
|
|
2283
|
+
resolveModelMetadata,
|
|
2284
|
+
runPipeline,
|
|
2285
|
+
splitPDFIntoChunks,
|
|
2286
|
+
toProviderString,
|
|
2287
|
+
validateFlowInputFormat,
|
|
2288
|
+
validateJson,
|
|
2289
|
+
validateMimeType,
|
|
2290
|
+
validateMimeTypeAsync,
|
|
2291
|
+
validateNodeConnection
|
|
2292
|
+
};
|
|
2293
|
+
//# sourceMappingURL=index.js.map
|