@struktur/sdk 2.1.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4111 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/strategies.js +2435 -0
- package/dist/strategies.js.map +1 -0
- package/package.json +24 -12
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/providers.ts +0 -7
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/index.ts +0 -6
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/index.ts +0 -7
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
package/dist/index.js
ADDED
|
@@ -0,0 +1,4111 @@
|
|
|
1
|
+
// Bundler runtime helpers (esbuild-style output — presumably generated; do not hand-edit).
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;
// Lazily runs a module's init function exactly once: on first call the single
// function stored in `fn` executes and `fn` is cleared (set to 0) so later
// calls just return the cached `res`.
var __esm = (fn, res) => function __init() {
  return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
};
// Defines every key of `all` on `target` as an enumerable lazy getter,
// re-exporting live bindings.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
|
|
10
|
+
|
|
11
|
+
// src/parsers/collect.ts
/**
 * Drain a WHATWG ReadableStream of byte chunks into a single Node Buffer.
 *
 * The reader lock is always released, even if reading throws, so the
 * stream remains usable/cancellable by the caller.
 *
 * @param stream ReadableStream whose chunks are Uint8Array-like.
 * @returns Buffer containing every chunk concatenated in arrival order.
 */
async function collectStream(stream) {
  const reader = stream.getReader();
  const parts = [];
  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      parts.push(value);
    }
  } finally {
    reader.releaseLock();
  }
  // Buffer.concat copies all chunks into one freshly allocated Buffer —
  // the same result as the manual length-sum + offset-copy approach.
  return Buffer.concat(parts);
}
|
|
35
|
+
// Lazy one-shot initializer for src/parsers/collect.ts; the module has no
// top-level state, so the body only asserts strict mode.
var init_collect = __esm({
  "src/parsers/collect.ts"() {
    "use strict";
  }
});
|
|
40
|
+
|
|
41
|
+
// src/parsers/pdf.ts
// Lazy-loaded export namespace for the PDF parser module.
var pdf_exports = {};
__export(pdf_exports, {
  parsePdf: () => pdf_exports && parsePdf
});
/**
 * Parse a PDF (Buffer or ReadableStream) into an artifact object with
 * per-page text, optional embedded images, and optional page screenshots.
 *
 * Options (all optional):
 *   - includeImages: extract embedded images unless explicitly `false`.
 *   - screenshots: render page screenshots when exactly `true`.
 *   - screenshotWidth / screenshotScale: screenshot sizing; width wins
 *     when given, otherwise scale (default 1.5) is used.
 *
 * Image and screenshot extraction are best-effort: failures are swallowed
 * and the artifact is still produced from whatever text was recovered.
 */
async function parsePdf(input, options) {
  const buffer = Buffer.isBuffer(input) ? input : await collectStream(input);
  // pdf-parse is imported lazily so consumers without PDFs never pay for it.
  const { PDFParse } = await import("pdf-parse");
  const parser = new PDFParse({ data: buffer });
  const textResult = await parser.getText();
  // page number -> non-empty page text
  const pageTextMap = /* @__PURE__ */ new Map();
  if (textResult.pages.length > 0) {
    for (const page of textResult.pages) {
      if (page.text && page.text.trim().length > 0) {
        pageTextMap.set(page.num, page.text);
      }
    }
  }
  let imageResult;
  if (options?.includeImages !== false) {
    try {
      imageResult = await parser.getImage({ imageBuffer: false, imageDataUrl: true });
    } catch {
      // best-effort: missing image support must not fail text extraction
    }
  }
  let screenshotResult;
  if (options?.screenshots === true) {
    try {
      const screenshotParams = { imageBuffer: false, imageDataUrl: true };
      if (options.screenshotWidth !== void 0) {
        screenshotParams.desiredWidth = options.screenshotWidth;
      } else {
        screenshotParams.scale = options.screenshotScale ?? 1.5;
      }
      screenshotResult = await parser.getScreenshot(screenshotParams);
    } catch {
      // best-effort: screenshot rendering failures are ignored
    }
  }
  // page number -> artifact images (embedded first, then screenshots appended)
  const pageImageMap = /* @__PURE__ */ new Map();
  if (imageResult) {
    for (const pageImages of imageResult.pages) {
      const artifactImages = pageImages.images.filter((img) => img.dataUrl).map((img) => {
        // Strip the data-URL prefix, keeping only the base64 payload.
        const base64 = img.dataUrl.replace(/^data:[^;]+;base64,/, "");
        const artifactImage = {
          type: "image",
          base64,
          width: img.width,
          height: img.height,
          imageType: "embedded"
        };
        return artifactImage;
      });
      if (artifactImages.length > 0) {
        pageImageMap.set(pageImages.pageNumber, artifactImages);
      }
    }
  }
  if (screenshotResult) {
    for (const screenshot of screenshotResult.pages) {
      if (screenshot.dataUrl) {
        const base64 = screenshot.dataUrl.replace(/^data:[^;]+;base64,/, "");
        const artifactImage = {
          type: "image",
          base64,
          width: screenshot.width,
          height: screenshot.height,
          imageType: "screenshot"
        };
        const existing = pageImageMap.get(screenshot.pageNumber) ?? [];
        pageImageMap.set(screenshot.pageNumber, [...existing, artifactImage]);
      }
    }
  }
  let contents;
  if (textResult.pages.length > 0) {
    // Union of pages that have text and/or images, emitted in page order.
    const allPageNums = /* @__PURE__ */ new Set([
      ...pageTextMap.keys(),
      ...pageImageMap.keys()
    ]);
    contents = Array.from(allPageNums).sort((a, b) => a - b).map((pageNum) => {
      const entry = { page: pageNum };
      const text = pageTextMap.get(pageNum);
      if (text) entry.text = text;
      const media = pageImageMap.get(pageNum);
      if (media) entry.media = media;
      return entry;
    });
  } else {
    // No per-page breakdown available: fall back to the whole-document text
    // plus (at most) the first page's images.
    const entry = { text: textResult.text };
    const firstPageImages = pageImageMap.size > 0 ? pageImageMap.values().next().value : void 0;
    if (firstPageImages) entry.media = firstPageImages;
    contents = [entry];
  }
  if (contents.length === 0) {
    contents = [{ text: "" }];
  }
  let infoResult;
  try {
    infoResult = await parser.getInfo();
  } catch {
    // metadata is optional
  }
  await parser.destroy();
  return {
    id: `artifact-${crypto.randomUUID()}`,
    type: "pdf",
    // raw() returns the original bytes, not a re-serialization.
    raw: async () => buffer,
    contents,
    metadata: infoResult ? {
      numpages: textResult.total,
      info: infoResult
    } : { numpages: textResult.total }
  };
}
|
|
154
|
+
// Lazy one-shot initializer for src/parsers/pdf.ts; pulls in the collect
// module (collectStream) that parsePdf depends on.
var init_pdf = __esm({
  "src/parsers/pdf.ts"() {
    "use strict";
    init_collect();
  }
});
|
|
160
|
+
|
|
161
|
+
// src/fields.ts
// Canonical scalar type names accepted by the shorthand fields syntax.
var SCALAR_TYPES = /* @__PURE__ */ new Set([
  "string",
  "number",
  "boolean",
  "integer",
  "int"
]);
// Convenience aliases resolved before validation.
var SCALAR_ALIASES = {
  bool: "boolean",
  float: "number"
  // Note: "int" stays as "int" (not aliased to "integer") so the schema
  // builder can emit the extra multipleOf:1 constraint.
};
/**
 * Extract the content between `prefix{` and a trailing `}`.
 * Returns null when rawType is not of that exact shape.
 */
var extractBraces = (rawType, prefix) => {
  const opensWithPrefix = rawType.startsWith(prefix + "{");
  const closesWithBrace = rawType.endsWith("}");
  if (!opensWithPrefix || !closesWithBrace) {
    return null;
  }
  return rawType.slice(prefix.length + 1, -1);
};
/**
 * Resolve aliases and validate a scalar type name.
 * @throws Error naming the offending field when the type is unknown.
 */
var parseScalarType = (raw, fieldName) => {
  const canonical = SCALAR_ALIASES[raw] ?? raw;
  if (SCALAR_TYPES.has(canonical)) {
    return canonical;
  }
  const allNames = [...Object.keys(SCALAR_ALIASES), ...SCALAR_TYPES].sort();
  throw new Error(
    `Unknown type "${raw}" for field "${fieldName}". Scalar types: ${allNames.join(", ")}. Complex types: enum{a|b|c}, array{string}, or array (shorthand for array{string}).`
  );
};
/**
 * Parse one `name` / `name:type` token into a field descriptor of kind
 * "scalar", "enum", or "array". A bare name defaults to scalar string.
 */
var parseField = (token) => {
  const colonIndex = token.indexOf(":");
  if (colonIndex === -1) {
    // No type given: default to a string scalar.
    const bareName = token.trim();
    if (!bareName) throw new Error("Empty field name in fields string.");
    return { name: bareName, kind: "scalar", type: "string" };
  }
  const name = token.slice(0, colonIndex).trim();
  const rawType = token.slice(colonIndex + 1).trim();
  if (!name) {
    throw new Error(`Empty field name before colon in token: "${token}".`);
  }
  if (!rawType) {
    throw new Error(
      `Empty type after colon for field "${name}". Omit the colon or specify a type.`
    );
  }
  const enumContent = extractBraces(rawType, "enum");
  if (enumContent !== null) {
    const values = enumContent.split("|").map((v) => v.trim()).filter(Boolean);
    if (values.length < 2) {
      throw new Error(
        `enum for field "${name}" must have at least two values separated by "|", got: "${enumContent}".`
      );
    }
    return { name, kind: "enum", values };
  }
  const arrayContent = extractBraces(rawType, "array");
  if (arrayContent !== null) {
    const itemType = arrayContent.trim();
    if (!itemType) {
      throw new Error(
        `array for field "${name}" requires an item type, e.g. array{string}.`
      );
    }
    return { name, kind: "array", items: parseScalarType(itemType, name) };
  }
  if (rawType === "array") {
    // Bare "array" is shorthand for array{string}.
    return { name, kind: "array", items: "string" };
  }
  return { name, kind: "scalar", type: parseScalarType(rawType, name) };
};
/**
 * Split a comma-separated fields string into tokens (commas inside braces
 * do not split, so enum{a|b} style values survive) and parse each token.
 * @throws on empty input or unbalanced braces.
 */
var parseFieldsString = (fields) => {
  if (!fields.trim()) {
    throw new Error("Fields string must not be empty.");
  }
  const tokens = [];
  let braceDepth = 0;
  let pending = "";
  for (const ch of fields) {
    if (ch === "," && braceDepth === 0) {
      tokens.push(pending);
      pending = "";
      continue;
    }
    if (ch === "{") {
      braceDepth++;
    } else if (ch === "}") {
      braceDepth--;
    }
    pending += ch;
  }
  if (pending) tokens.push(pending);
  if (braceDepth !== 0) {
    throw new Error("Unmatched braces in fields string.");
  }
  return tokens.map((token) => parseField(token));
};
/**
 * Build a strict JSON Schema (all fields required, no extra properties)
 * from parsed field descriptors. "int" maps to integer + multipleOf:1.
 */
var buildSchemaFromParsedFields = (fields) => {
  if (fields.length === 0) {
    throw new Error("Cannot build a schema from an empty fields list.");
  }
  const intSchema = () => ({ type: "integer", multipleOf: 1 });
  const properties = {};
  const required = [];
  for (const field of fields) {
    switch (field.kind) {
      case "scalar":
        properties[field.name] = field.type === "int" ? intSchema() : { type: field.type };
        break;
      case "enum":
        properties[field.name] = { type: "string", enum: field.values };
        break;
      default:
        properties[field.name] = {
          type: "array",
          items: field.items === "int" ? intSchema() : { type: field.items }
        };
    }
    required.push(field.name);
  }
  return {
    type: "object",
    properties,
    required,
    additionalProperties: false
  };
};
/** One-step shorthand: fields string -> JSON Schema. */
var buildSchemaFromFields = (fields) => buildSchemaFromParsedFields(parseFieldsString(fields));
|
|
284
|
+
|
|
285
|
+
// src/extract.ts
// Zero-valued usage returned when extraction fails before any model call.
var emptyUsage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
/**
 * Best-effort telemetry shutdown. A missing exporter or a throwing
 * shutdown() must never mask the extraction result.
 */
var safeShutdown = async (telemetry) => {
  if (telemetry) {
    try {
      await telemetry.shutdown();
    } catch {
      // Deliberately swallowed: telemetry failures are non-fatal.
    }
  }
};
/**
 * Best-effort span completion; no-ops when telemetry or the span is absent,
 * and swallows any error thrown by endSpan().
 */
var safeEndSpan = (telemetry, span, result) => {
  if (telemetry && span) {
    try {
      telemetry.endSpan(span, result);
    } catch {
      // Deliberately swallowed: telemetry failures are non-fatal.
    }
  }
};
|
|
301
|
+
/**
 * Resolve the target JSON Schema from extract() options.
 *
 * Exactly one of `schema` (a JSON Schema object) or `fields` (the shorthand
 * fields string) must be provided; `fields` is expanded via
 * buildSchemaFromFields.
 *
 * @throws Error when both or neither are supplied.
 */
var resolveSchema = (options) => {
  const hasSchema = options.schema !== void 0;
  const hasFields = options.fields !== void 0;
  if (hasSchema === hasFields) {
    // Either both were given or neither was — both are misuse.
    throw new Error(
      hasSchema
        ? "Provide either `schema` or `fields`, not both. They are mutually exclusive."
        : "A schema definition is required. Provide `schema` (a JSON Schema object) or `fields` (a shorthand fields string)."
    );
  }
  return hasFields ? buildSchemaFromFields(options.fields) : options.schema;
};
|
|
319
|
+
/**
 * Top-level extraction entry point.
 *
 * Resolves the schema, then delegates to the configured strategy's run(),
 * emitting debug events, optional step callbacks, and telemetry spans
 * around the whole run. Never throws: every failure path resolves to
 * `{ data: null, usage: emptyUsage, error }`.
 */
var extract = async (options) => {
  const debug = options.debug;
  let telemetry = options.telemetry;
  if (telemetry) {
    try {
      await telemetry.initialize();
    } catch (error) {
      // Telemetry is optional: log and continue without it.
      console.error("Telemetry initialization failed, continuing without telemetry:", error.message);
      telemetry = void 0;
    }
  }
  const rootSpan = telemetry?.startSpan({
    name: "struktur.extract",
    kind: "CHAIN",
    attributes: {
      "extraction.strategy": options.strategy?.name ?? "default",
      "extraction.artifacts.count": options.artifacts.length
    }
  });
  try {
    let resolvedOptions;
    try {
      // Schema resolution errors are reported through the same
      // debug/telemetry channels as strategy errors, then returned.
      const schema = resolveSchema(options);
      resolvedOptions = { ...options, schema };
    } catch (error) {
      debug?.extractionComplete({
        success: false,
        totalInputTokens: 0,
        totalOutputTokens: 0,
        totalTokens: 0,
        error: error.message
      });
      safeEndSpan(telemetry, rootSpan, {
        status: "error",
        error
      });
      await safeShutdown(telemetry);
      return {
        data: null,
        usage: emptyUsage,
        error
      };
    }
    // total may be undefined when the strategy cannot estimate its steps.
    const total = resolvedOptions.strategy.getEstimatedSteps?.(resolvedOptions.artifacts);
    debug?.strategyRunStart({
      strategy: resolvedOptions.strategy.name,
      estimatedSteps: total ?? 1,
      artifactCount: resolvedOptions.artifacts.length
    });
    // "start" step: both the caller's onStep hook and the debug logger.
    await resolvedOptions.events?.onStep?.({ step: 1, total, label: "start" });
    debug?.step({
      step: 1,
      total,
      label: "start",
      strategy: resolvedOptions.strategy.name
    });
    const result = await resolvedOptions.strategy.run(resolvedOptions);
    // "complete" step mirrors the start notifications.
    await resolvedOptions.events?.onStep?.({
      step: total ?? 1,
      total,
      label: "complete"
    });
    debug?.step({
      step: total ?? 1,
      total,
      label: "complete",
      strategy: resolvedOptions.strategy.name
    });
    debug?.extractionComplete({
      success: !result.error,
      totalInputTokens: result.usage.inputTokens,
      totalOutputTokens: result.usage.outputTokens,
      totalTokens: result.usage.totalTokens,
      error: result.error?.message
    });
    safeEndSpan(telemetry, rootSpan, {
      status: result.error ? "error" : "ok",
      output: result.data,
      error: result.error
    });
    await safeShutdown(telemetry);
    return result;
  } catch (error) {
    // Unexpected failure from the strategy or event hooks: report and
    // return an error result instead of throwing.
    debug?.extractionComplete({
      success: false,
      totalInputTokens: 0,
      totalOutputTokens: 0,
      totalTokens: 0,
      error: error.message
    });
    safeEndSpan(telemetry, rootSpan, {
      status: "error",
      error
    });
    await safeShutdown(telemetry);
    return {
      data: null,
      usage: emptyUsage,
      error
    };
  }
};
|
|
421
|
+
|
|
422
|
+
// src/artifacts/urlToArtifact.ts
/**
 * Fetch a serialized artifact as JSON from a URL and re-attach a raw()
 * accessor.
 *
 * NOTE(review): since response.json() cannot deserialize functions,
 * `data.raw` is presumably always undefined here, so the fallback raw()
 * (the JSON-encoded contents) should always be used — confirm against the
 * artifact serialization format.
 *
 * @throws Error on any non-2xx HTTP status.
 */
var urlToArtifact = async (url) => {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch artifact: ${response.status} ${response.statusText}`);
  }
  const data = await response.json();
  return {
    ...data,
    raw: data.raw ?? (async () => Buffer.from(JSON.stringify(data.contents ?? [])))
  };
};
|
|
434
|
+
|
|
435
|
+
// src/artifacts/providers.ts
|
|
436
|
+
var defaultArtifactProviders = {};
|
|
437
|
+
|
|
438
|
+
// src/artifacts/fileToArtifact.ts
|
|
439
|
+
var bufferToTextArtifact = (buffer) => ({
|
|
440
|
+
id: `artifact-${crypto.randomUUID()}`,
|
|
441
|
+
type: "text",
|
|
442
|
+
raw: async () => buffer,
|
|
443
|
+
contents: [{ text: buffer.toString() }]
|
|
444
|
+
});
|
|
445
|
+
var bufferToImageArtifact = (buffer) => ({
|
|
446
|
+
id: `artifact-${crypto.randomUUID()}`,
|
|
447
|
+
type: "image",
|
|
448
|
+
raw: async () => buffer,
|
|
449
|
+
contents: [{ media: [{ type: "image", contents: buffer }] }]
|
|
450
|
+
});
|
|
451
|
+
var fileToArtifact = async (buffer, options) => {
|
|
452
|
+
const providers = options.providers ?? defaultArtifactProviders;
|
|
453
|
+
const provider = providers[options.mimeType];
|
|
454
|
+
if (provider) {
|
|
455
|
+
return provider(buffer);
|
|
456
|
+
}
|
|
457
|
+
if (options.mimeType.startsWith("text/")) {
|
|
458
|
+
return bufferToTextArtifact(buffer);
|
|
459
|
+
}
|
|
460
|
+
if (options.mimeType.startsWith("image/")) {
|
|
461
|
+
return bufferToImageArtifact(buffer);
|
|
462
|
+
}
|
|
463
|
+
throw new Error(
|
|
464
|
+
`No artifact provider registered for ${options.mimeType}`
|
|
465
|
+
);
|
|
466
|
+
};
|
|
467
|
+
|
|
468
|
+
// src/validation/validator.ts
|
|
469
|
+
import Ajv from "ajv";
|
|
470
|
+
import addFormats from "ajv-formats";
|
|
471
|
+
/**
 * Error carrying the raw Ajv error objects for a failed validation.
 */
var SchemaValidationError = class extends Error {
  /** Ajv error objects describing each individual failure. */
  errors;
  constructor(message, errors) {
    super(message);
    this.errors = errors;
    this.name = "SchemaValidationError";
  }
};
// Shape of image references emitted by extraction, e.g.
// "artifact:<id>/images/image3.png".
var ARTIFACT_ID_PATTERN = /^artifact:[^/]+\/images\/image\d+\.\w+$/;
/**
 * Build an Ajv instance configured for extraction schemas: collect all
 * errors, tolerate non-strict schemas and union types, standard formats,
 * plus the custom "artifact-id" string format.
 */
var createAjv = () => {
  const instance = new Ajv({
    allErrors: true,
    strict: false,
    allowUnionTypes: true
  });
  addFormats(instance);
  instance.addFormat("artifact-id", {
    type: "string",
    validate: (data) => ARTIFACT_ID_PATTERN.test(data)
  });
  return instance;
};
/**
 * Validate data against schema, returning the data unchanged on success.
 * @throws SchemaValidationError with the Ajv errors on failure.
 */
var validateOrThrow = (ajv, schema, data) => {
  const check = ajv.compile(schema);
  if (check(data)) {
    return data;
  }
  throw new SchemaValidationError("Schema validation failed", check.errors ?? []);
};
/** True when the Ajv error is a missing-required-property error. */
var isRequiredError = (error) => error.keyword === "required";
/**
 * Validate, but tolerate missing required properties on the final attempt:
 * if the only failures are "required" errors and isFinalAttempt is true,
 * the data is accepted anyway. Any other error kind always fails.
 */
var validateAllowingMissingRequired = (ajv, schema, data, isFinalAttempt = true) => {
  const check = ajv.compile(schema);
  if (check(data)) {
    return { valid: true, data };
  }
  const allErrors = check.errors ?? [];
  const hardErrors = allErrors.filter((error) => !isRequiredError(error));
  if (hardErrors.length > 0) {
    return { valid: false, errors: hardErrors };
  }
  // Only "required" violations remain.
  return isFinalAttempt ? { valid: true, data } : { valid: false, errors: allErrors };
};
|
|
522
|
+
|
|
523
|
+
// src/parsers/runner.ts
|
|
524
|
+
import os from "os";
|
|
525
|
+
import path from "path";
|
|
526
|
+
import { rm, writeFile, readFile } from "fs/promises";
|
|
527
|
+
import { exec } from "child_process";
|
|
528
|
+
import { promisify } from "util";
|
|
529
|
+
// Promise-returning wrapper around child_process.exec.
var execAsync = promisify(exec);
/**
 * Parse a parser command's stdout (expected: JSON-serialized artifacts)
 * into hydrated artifact objects.
 *
 * validateSerializedArtifacts / hydrateSerializedArtifacts are defined
 * elsewhere in this bundle.
 *
 * @throws Error with a truncated stdout excerpt when stdout is not valid JSON.
 */
var parseCommandOutput = (stdout) => {
  let parsed;
  try {
    parsed = JSON.parse(stdout);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Parser command produced invalid JSON: ${message}
Output: ${stdout.slice(0, 200)}`);
  }
  const serialized = validateSerializedArtifacts(parsed);
  return hydrateSerializedArtifacts(serialized);
};
|
|
542
|
+
/**
 * Run a shell command and return its stdout, optionally feeding
 * stdinBuffer to the process's stdin.
 *
 * NOTE(review): stdinBuffer.toString() decodes the bytes as UTF-8 before
 * passing them as exec input — presumably fine for text payloads, but this
 * would corrupt arbitrary binary stdin; confirm intended inputs.
 *
 * maxBuffer is raised to 50 MiB to accommodate large parser output.
 *
 * @throws Error for blank commands, or a wrapped error containing up to
 *         500 chars of stderr when the command fails.
 */
var spawnAndCapture = async (command, stdinBuffer) => {
  if (!command.trim()) {
    throw new Error(`Empty command: ${command}`);
  }
  try {
    const options = stdinBuffer ? { input: stdinBuffer.toString(), maxBuffer: 50 * 1024 * 1024 } : { maxBuffer: 50 * 1024 * 1024 };
    const { stdout } = await execAsync(command, options);
    return stdout;
  } catch (error) {
    // exec errors carry stderr; surface a truncated excerpt for diagnosis.
    if (error instanceof Error && "stderr" in error) {
      const stderr = error.stderr;
      throw new Error(
        `Parser command failed: ${command}
Stderr: ${stderr?.slice(0, 500) ?? ""}`
      );
    }
    throw error;
  }
};
|
|
561
|
+
/**
 * Run an npm-package parser against the input.
 *
 * The package may export parseFile(path, mimeType) and/or
 * parseStream(webStream, mimeType). Dispatch rules:
 *   - file input + parseFile        -> parseFile directly
 *   - file input + only parseStream -> stream the file via Readable.toWeb
 *   - buffer input + parseStream    -> wrap the buffer in a one-chunk
 *                                      ReadableStream
 *   - buffer input + only parseFile -> write a temp file, parse it, and
 *                                      always delete the temp file
 *
 * @throws Error when the package exports neither entry point.
 */
var runNpmParser = async (pkg, input, mimeType) => {
  const mod = await import(pkg);
  const hasParseFile = typeof mod.parseFile === "function";
  const hasParseStream = typeof mod.parseStream === "function";
  if (!hasParseFile && !hasParseStream) {
    throw new Error(
      `npm parser package "${pkg}" exports neither parseFile nor parseStream`
    );
  }
  if (input.kind === "file") {
    if (hasParseFile) {
      return mod.parseFile(input.path, mimeType);
    }
    // Only parseStream available: adapt the file to a web stream.
    const { createReadStream } = await import("fs");
    const { Readable } = await import("stream");
    const nodeStream = createReadStream(input.path);
    const stream = Readable.toWeb(nodeStream);
    return mod.parseStream(stream, mimeType);
  }
  if (hasParseStream) {
    // Buffer input: expose it as a single-chunk web ReadableStream.
    const stream = new ReadableStream({
      start(controller) {
        controller.enqueue(input.buffer);
        controller.close();
      }
    });
    return mod.parseStream(stream, mimeType);
  }
  // Buffer input but the package only supports files: use a temp file.
  const tmpFile = path.join(os.tmpdir(), `struktur-parse-${crypto.randomUUID()}`);
  try {
    await writeFile(tmpFile, input.buffer);
    return await mod.parseFile(tmpFile, mimeType);
  } finally {
    // force:true ignores a missing file so cleanup never throws ENOENT.
    await rm(tmpFile, { force: true });
  }
};
|
|
597
|
+
/**
 * Run a shell-command parser that reads its input from a file path.
 *
 * Every literal "FILE_PATH" in the command is replaced with the input's
 * path; buffer inputs are first written to a temp file, which is always
 * removed afterwards. The command's stdout must be serialized artifacts
 * (see parseCommandOutput).
 */
var runCommandFileParser = async (command, input) => {
  let filePath;
  let tempFile = null;
  if (input.kind === "file") {
    filePath = input.path;
  } else {
    // Buffer input: materialize it on disk for the command to read.
    tempFile = path.join(os.tmpdir(), `struktur-parse-${crypto.randomUUID()}`);
    await writeFile(tempFile, input.buffer);
    filePath = tempFile;
  }
  try {
    const interpolated = command.replace(/FILE_PATH/g, filePath);
    const stdout = await spawnAndCapture(interpolated);
    return parseCommandOutput(stdout);
  } finally {
    if (tempFile) {
      // Clean up only the temp file we created, never a caller-owned path.
      await rm(tempFile, { force: true });
    }
  }
};
|
|
617
|
+
/**
 * Run a shell-command parser that consumes its input on stdin.
 *
 * File inputs are read fully into memory first; the command's stdout must
 * be serialized artifacts (see parseCommandOutput).
 */
var runCommandStdinParser = async (command, input) => {
  const payload = input.kind === "file" ? await readFile(input.path) : input.buffer;
  const stdout = await spawnAndCapture(command, payload);
  return parseCommandOutput(stdout);
};
|
|
627
|
+
/**
 * Dispatch a parser definition to its runner.
 *
 * Supported def.type values: "npm", "command-file", "command-stdin", and
 * "inline" (an in-process handler receiving the raw bytes; its single
 * artifact is wrapped in an array).
 *
 * @throws Error for an unrecognized def.type.
 */
var runParser = async (def, input, mimeType) => {
  if (def.type === "npm") {
    return runNpmParser(def.package, input, mimeType);
  }
  if (def.type === "command-file") {
    return runCommandFileParser(def.command, input);
  }
  if (def.type === "command-stdin") {
    return runCommandStdinParser(def.command, input);
  }
  if (def.type === "inline") {
    const bytes = input.kind === "file" ? await readFile(input.path) : input.buffer;
    return [await def.handler(bytes)];
  }
  // Exhaustiveness guard: surfaces unknown definitions loudly.
  const _exhaustive = def;
  throw new Error(`Unknown parser type: ${_exhaustive.type}`);
};
|
|
650
|
+
|
|
651
|
+
// src/artifacts/input.ts
|
|
652
|
+
import { readFile as readFile2 } from "fs/promises";
|
|
653
|
+
|
|
654
|
+
// src/parsers/mime.ts
|
|
655
|
+
import path2 from "path";
|
|
656
|
+
// Magic-number signatures checked at the start of a file's header.
var MAGIC_BYTES = [
  // PDF: %PDF
  { mimeType: "application/pdf", bytes: [37, 80, 68, 70] },
  // PNG: 89 50 4E 47
  { mimeType: "image/png", bytes: [137, 80, 78, 71] },
  // JPEG: FF D8 FF
  { mimeType: "image/jpeg", bytes: [255, 216, 255] },
  // GIF: GIF8
  { mimeType: "image/gif", bytes: [71, 73, 70, 56] },
  // ZIP / Office Open XML (DOCX/XLSX/PPTX all start with PK\x03\x04)
  { mimeType: "application/zip", bytes: [80, 75, 3, 4] }
];
/** True when `bytes` appear in `header` starting at `offset`. */
var matchesMagicBytes = (header, bytes, offset = 0) => {
  if (header.length < offset + bytes.length) {
    return false;
  }
  for (let i = 0; i < bytes.length; i++) {
    if (header[offset + i] !== bytes[i]) {
      return false;
    }
  }
  return true;
};
/**
 * WebP detection: a RIFF container ("RIFF" at 0) whose form type is
 * "WEBP" at offset 8. The 4-byte chunk size in between is ignored.
 */
var isWebP = (header) =>
  matchesMagicBytes(header, [82, 73, 70, 70]) && matchesMagicBytes(header, [87, 69, 66, 80], 8);
/**
 * Identify a MIME type from the leading bytes of a file, or null when no
 * known signature matches. WebP is checked first since it needs a
 * two-part signature.
 */
var detectFromMagicBytes = (header) => {
  if (isWebP(header)) {
    return "image/webp";
  }
  const hit = MAGIC_BYTES.find(
    ({ bytes, offset }) => matchesMagicBytes(header, bytes, offset ?? 0)
  );
  return hit ? hit.mimeType : null;
};
|
|
690
|
+
// Fallback extension -> MIME lookup, consulted by detectMimeType when magic-byte
// sniffing yields nothing. Keys are lowercase extensions with the leading dot.
var EXTENSION_MIME_MAP = {
  ".txt": "text/plain",
  ".md": "text/markdown",
  ".markdown": "text/markdown",
  ".html": "text/html",
  ".htm": "text/html",
  ".json": "application/json",
  ".pdf": "application/pdf",
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".csv": "text/csv",
  ".xml": "application/xml",
  ".yaml": "application/yaml",
  ".yml": "application/yaml",
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  ".mp4": "video/mp4",
  ".mp3": "audio/mpeg",
  ".wav": "audio/wav",
  ".ogg": "audio/ogg",
  ".svg": "image/svg+xml",
  // Source files are deliberately mapped to text types so they flow through
  // the plain-text artifact path rather than being rejected as unknown.
  ".ts": "text/plain",
  ".tsx": "text/plain",
  ".js": "text/javascript",
  ".jsx": "text/javascript",
  ".css": "text/css",
  ".toml": "application/toml"
};
|
|
722
|
+
/**
 * Resolve the MIME type of an input, trying in priority order:
 *   1. an explicit caller-supplied override,
 *   2. magic-byte sniffing of the buffer's first 512 bytes,
 *   3. npm-parser plugins exporting a `detectFileType(header)` probe,
 *   4. the file-extension lookup table.
 * Returns null when nothing matches.
 *
 * @param {{buffer?: Buffer, filePath?: string, mimeOverride?: string, npmParsers?: Array}} options
 * @returns {Promise<string|null>}
 */
async function detectMimeType(options) {
  const { buffer, filePath, mimeOverride, npmParsers } = options;
  if (mimeOverride) {
    return mimeOverride;
  }
  if (buffer && buffer.length > 0) {
    // 512 bytes is more than enough for every signature checked here.
    const header = buffer.subarray(0, 512);
    const magicMime = detectFromMagicBytes(header);
    if (magicMime) {
      return magicMime;
    }
    if (npmParsers && npmParsers.length > 0) {
      for (const entry of npmParsers) {
        try {
          // Dynamically load the plugin package and let it probe the header.
          const mod = await import(entry.def.package);
          if (typeof mod.detectFileType === "function" && mod.detectFileType(header)) {
            return entry.mimeType;
          }
        } catch {
          // Best-effort: a plugin that fails to load or probe is skipped.
        }
      }
    }
  }
  if (filePath) {
    const ext = path2.extname(filePath).toLowerCase();
    if (ext && ext in EXTENSION_MIME_MAP) {
      return EXTENSION_MIME_MAP[ext] ?? null;
    }
  }
  return null;
}
|
|
753
|
+
|
|
754
|
+
// src/artifacts/input.ts
|
|
755
|
+
// JSON Schemas describing the serialized (plain-JSON) artifact format.
// An image entry must carry at least one of `url` or `base64`.
var serializedArtifactImageSchema = {
  type: "object",
  required: ["type"],
  properties: {
    type: { const: "image" },
    url: { type: "string", minLength: 1 },
    base64: { type: "string", minLength: 1 },
    text: { type: "string" },
    x: { type: "number" },
    y: { type: "number" },
    width: { type: "number" },
    height: { type: "number" },
    imageType: { enum: ["embedded", "screenshot"] }
  },
  additionalProperties: false,
  anyOf: [{ required: ["url"] }, { required: ["base64"] }]
};
// A content block carries text and/or media; at least one must be present.
var serializedArtifactContentSchema = {
  type: "object",
  properties: {
    page: { type: "number" },
    text: { type: "string" },
    media: { type: "array", items: serializedArtifactImageSchema }
  },
  additionalProperties: false,
  anyOf: [{ required: ["text"] }, { required: ["media"] }]
};
// A full artifact: identifier, type tag, and ordered content blocks.
// `tokens` is an optional precomputed token count (see countArtifactTokens).
var serializedArtifactSchema = {
  type: "object",
  required: ["id", "type", "contents"],
  properties: {
    id: { type: "string", minLength: 1 },
    type: { enum: ["text", "image", "pdf", "file"] },
    contents: { type: "array", items: serializedArtifactContentSchema },
    metadata: { type: "object", additionalProperties: true },
    tokens: { type: "number" }
  },
  additionalProperties: false
};
// Top-level payload: either a single artifact object or an array of them.
var serializedArtifactsSchema = {
  anyOf: [
    serializedArtifactSchema,
    { type: "array", items: serializedArtifactSchema }
  ]
};
|
|
800
|
+
// Shared registry of custom artifact input parsers; consulted before the
// built-in parser chain (see `parse`).
var inputParsers = [];
// Append a parser to the registry.
var registerArtifactInputParser = (parser) => {
  inputParsers.push(parser);
};
// Empty the registry in place (the array identity is preserved on purpose,
// since other modules hold a reference to it).
var clearArtifactInputParsers = () => {
  inputParsers.splice(0, inputParsers.length);
};
|
|
807
|
+
// Validate unknown data against the serialized-artifacts schema (throwing on
// failure) and normalize the result to an array: a single artifact object is
// wrapped, an array is returned as-is.
var validateSerializedArtifacts = (data) => {
  const validated = validateOrThrow(createAjv(), serializedArtifactsSchema, data);
  if (Array.isArray(validated)) {
    return validated;
  }
  return [validated];
};
|
|
816
|
+
// Re-attach a `raw` accessor to plain deserialized artifacts. The raw bytes
// are lazily reconstructed from the JSON-encoded contents at call time.
var hydrateSerializedArtifacts = (items) => {
  return items.map((item) => {
    const raw = async () => Buffer.from(JSON.stringify(item.contents ?? []));
    return { ...item, raw };
  });
};
|
|
822
|
+
// Parse a JSON string and validate it as one or more serialized artifacts.
// Throws on malformed JSON (from JSON.parse) or schema violations.
var parseSerializedArtifacts = (text) => validateSerializedArtifacts(JSON.parse(text));
|
|
826
|
+
// Split raw text into content blocks on blank-line (paragraph) boundaries.
// When nothing survives trimming, fall back to one block with the original text.
var splitTextIntoContents = (text) => {
  const blocks = [];
  for (const piece of text.split(/\n\s*\n/g)) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      blocks.push({ text: trimmed });
    }
  }
  return blocks.length > 0 ? blocks : [{ text }];
};
// Wrap a buffer of text in a text artifact, splitting it into paragraph
// blocks. A random id is generated when none is supplied.
var bufferToTextArtifact2 = (buffer, id) => {
  const artifactId = id ?? `artifact-${crypto.randomUUID()}`;
  return {
    id: artifactId,
    type: "text",
    raw: async () => buffer,
    contents: splitTextIntoContents(buffer.toString())
  };
};
|
|
842
|
+
var bufferToImageArtifact2 = (buffer, id) => {
|
|
843
|
+
return {
|
|
844
|
+
id: id ?? `artifact-${crypto.randomUUID()}`,
|
|
845
|
+
type: "image",
|
|
846
|
+
raw: async () => buffer,
|
|
847
|
+
contents: [
|
|
848
|
+
{
|
|
849
|
+
media: [{ type: "image", contents: buffer }]
|
|
850
|
+
}
|
|
851
|
+
]
|
|
852
|
+
};
|
|
853
|
+
};
|
|
854
|
+
/**
 * Turn a raw buffer into artifacts based on its MIME type.
 * Resolution order: configured parser for the type -> provider registry ->
 * built-in handling (SerializedArtifact JSON, PDF, text/*, image/*).
 * Throws for MIME types nothing can handle.
 */
var parseBufferInput = async (buffer, mimeType, id, providers, parsers, includeImages, screenshots, screenshotScale, screenshotWidth) => {
  if (parsers) {
    const parserDef = parsers[mimeType];
    if (parserDef) {
      return runParser(parserDef, { kind: "buffer", buffer }, mimeType);
    }
  }
  const registry = providers ?? defaultArtifactProviders;
  const provider = registry[mimeType];
  if (provider) {
    return [await provider(buffer)];
  }
  if (mimeType === "application/json") {
    // JSON buffers are only accepted in SerializedArtifact form here;
    // arbitrary JSON needs an explicitly configured parser.
    try {
      const parsed = JSON.parse(buffer.toString());
      const serialized = validateSerializedArtifacts(parsed);
      return hydrateSerializedArtifacts(serialized);
    } catch {
      throw new Error(
        "Input is JSON but not in SerializedArtifact format. To parse arbitrary JSON files, configure a parser: struktur config parsers add --mime application/json ..."
      );
    }
  }
  if (mimeType === "application/pdf") {
    // PDF support is loaded lazily (bundler-generated init) to avoid paying
    // its import cost unless a PDF is actually parsed.
    const { parsePdf: parsePdf2 } = await Promise.resolve().then(() => (init_pdf(), pdf_exports));
    const pdfOptions = {
      includeImages,
      screenshots,
      screenshotScale,
      screenshotWidth
    };
    return [await parsePdf2(buffer, pdfOptions)];
  }
  if (mimeType.startsWith("text/")) {
    return [bufferToTextArtifact2(buffer, id)];
  }
  if (mimeType.startsWith("image/")) {
    return [bufferToImageArtifact2(buffer, id)];
  }
  throw new Error(`Unsupported MIME type: ${mimeType}`);
};
|
|
895
|
+
// Input parser for pre-serialized artifact JSON payloads: validates the data
// against the serialized-artifact schema and re-attaches raw() accessors.
var artifactJsonParser = {
  name: "artifact-json",
  canParse: (input) => input.kind === "artifact-json",
  parse: async (input) => {
    if (input.kind !== "artifact-json") {
      return [];
    }
    return hydrateSerializedArtifacts(validateSerializedArtifacts(input.data));
  }
};
|
|
906
|
+
// Input parser for inline text: encodes the string and delegates to the
// text-artifact builder.
var textParser = {
  name: "text",
  canParse: (input) => input.kind === "text",
  parse: async (input) => {
    if (input.kind !== "text") {
      return [];
    }
    return [bufferToTextArtifact2(Buffer.from(input.text), input.id)];
  }
};
|
|
917
|
+
// Input parser for on-disk files. Sniffs the MIME type (explicit override ->
// extension lookup -> octet-stream default), special-cases JSON (tries the
// SerializedArtifact format first, then any user-configured JSON parser),
// and otherwise loads the bytes and defers to parseBufferInput.
var fileParser = {
  name: "file",
  canParse: (input) => input.kind === "file",
  parse: async (input, options) => {
    if (input.kind !== "file") {
      return [];
    }
    const mimeType = input.mimeType ?? await detectMimeType({ filePath: input.path }) ?? "application/octet-stream";
    if (mimeType === "application/json") {
      const text = await readFile2(input.path, "utf-8");
      try {
        const parsed = JSON.parse(text);
        const serialized = validateSerializedArtifacts(parsed);
        return hydrateSerializedArtifacts(serialized);
      } catch {
        // Not SerializedArtifact JSON: fall back to a configured parser, if any.
        if (options?.parsers) {
          const parserDef = options.parsers[mimeType];
          if (parserDef) {
            return runParser(parserDef, { kind: "file", path: input.path }, mimeType);
          }
        }
        throw new Error(
          `File "${input.path}" is JSON but not in SerializedArtifact format. To parse arbitrary JSON files, configure a parser: struktur config parsers add --mime application/json ...`
        );
      }
    }
    const buffer = await readFile2(input.path);
    return parseBufferInput(
      buffer,
      mimeType,
      input.id,
      options?.providers,
      options?.parsers,
      options?.includeImages,
      options?.screenshots,
      options?.screenshotScale,
      options?.screenshotWidth
    );
  }
};
|
|
957
|
+
// Input parser for in-memory buffers with a known MIME type: a thin
// delegation to parseBufferInput.
var bufferParser = {
  name: "buffer",
  canParse: (input) => input.kind === "buffer",
  parse: async (input, options) => {
    if (input.kind !== "buffer") {
      return [];
    }
    const { buffer, mimeType, id } = input;
    return parseBufferInput(
      buffer,
      mimeType,
      id,
      options?.providers,
      options?.parsers,
      options?.includeImages,
      options?.screenshots,
      options?.screenshotScale,
      options?.screenshotWidth
    );
  }
};
|
|
977
|
+
// Find the first parser able to handle `input` and delegate to it.
// A caller-supplied `options.parsers` list replaces the default chain
// entirely; otherwise registered parsers are tried before the built-ins.
var parse = async (input, options) => {
  const chain = options?.parsers ?? [
    ...inputParsers,
    artifactJsonParser,
    textParser,
    fileParser,
    bufferParser
  ];
  const match = chain.find((candidate) => candidate.canParse(input));
  if (!match) {
    throw new Error(`No artifact input parser available for ${input.kind}`);
  }
  // Note: the per-type parser config travels under `parsers` here, distinct
  // from the parser *chain* accepted above.
  const parseOptions = {
    providers: options?.providers,
    parsers: options?.parserConfig,
    includeImages: options?.includeImages,
    screenshots: options?.screenshots,
    screenshotScale: options?.screenshotScale,
    screenshotWidth: options?.screenshotWidth
  };
  return match.parse(input, parseOptions);
};
|
|
998
|
+
|
|
999
|
+
// src/prompts/formatArtifacts.ts
|
|
1000
|
+
// Build a reference string for an image: prefer the image's own URL,
// otherwise synthesize an artifact-relative path. Base64 payloads are
// labeled .png, anything else .bin; `index` is zero-based, the ref 1-based.
var imageRefFor = (artifactId, index, image) => {
  if (image.url) {
    return image.url;
  }
  const ext = image.base64 ? "png" : "bin";
  return `artifact:${artifactId}/images/image${index + 1}.${ext}`;
};
|
|
1007
|
+
/**
 * Escape the five XML special characters so a value can be embedded safely
 * in attribute or element content.
 *
 * Fix: the original replacements mapped each character to itself
 * (e.g. `.replace(/&/g, "&")`), making the function a no-op and leaving the
 * generated XML malformed and injectable via artifact ids/text. Each
 * character now maps to its predefined XML entity; `&` is replaced first so
 * the entities themselves are not double-escaped.
 */
var escapeXml = (value) => {
  return value
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
};
|
|
1010
|
+
// Render artifacts as the <artifact>/<text>/<image> XML fed to the LLM.
// Text values and refs pass through escapeXml; media are emitted as refs
// (see imageRefFor), never inlined.
var formatArtifactsXml = (artifacts) => {
  const parts = [];
  for (const artifact of artifacts) {
    // NOTE: artifact.type is a closed enum per the schema, so it is not escaped.
    parts.push(`<artifact id="${escapeXml(artifact.id)}" type="${artifact.type}">`);
    for (const content of artifact.contents) {
      if (content.text) {
        const pageAttr = content.page !== void 0 ? ` page="${content.page}"` : "";
        parts.push(`  <text${pageAttr}>${escapeXml(content.text)}</text>`);
      }
      if (content.media?.length) {
        content.media.forEach((media, index) => {
          const ref = imageRefFor(artifact.id, index, media);
          const pageAttr = content.page !== void 0 ? ` page="${content.page}"` : "";
          parts.push(`  <image ref="${escapeXml(ref)}"${pageAttr} />`);
        });
      }
    }
    parts.push("</artifact>");
  }
  return parts.join("\n");
};
|
|
1031
|
+
|
|
1032
|
+
// src/prompts/ExtractorPrompt.ts
|
|
1033
|
+
// Build the system prompt for the structured-extraction LLM call. The caller
// supplies the JSON schema (already stringified) and optional extra output
// instructions, both interpolated into the template below.
//
// NOTE(review): the closing sentence "Any materials provided have been cleared
// for access. Extract and preserve this data for future use." asserts a
// clearance this code cannot verify and reads like safety-guard-bypass
// phrasing; it appears in this release's bundle — confirm it is intentional.
var extractorSystemPrompt = (schema, outputInstructions) => {
  return `<instructions>
You are a precise data extraction engine. Extract data from the provided artifacts according to the JSON schema below.

<thinking>
Before extracting, consider:
1. Which schema fields have clear values in the artifacts?
2. Which fields are missing or unclear (set these to null)?
3. For text fields, rewrite concisely while preserving all information
4. Ensure no data is lost - include everything that fits the schema
</thinking>

<rules>
- Strictly follow the schema - no extra fields, no missing required fields
- Use null for missing or uncertain values - never guess or assume
- Only extract information explicitly present in the artifacts
- Output ONLY valid JSON matching the schema
- No markdown, explanations, or code fences
</rules>

<output-instructions>
${outputInstructions ?? "No additional output instructions provided."}
</output-instructions>

<json-schema>
${schema}
</json-schema>

<artifact-examples>
<!-- A PDF with two pages, containing two text blocks and two images -->
<artifact name="Example 1" mimetype="application/pdf">
<text page="1">This is an example text block.</text>
<image filename="image1.jpg" page="1" />
<text page="2">This is another example text block.</text>
<image filename="image2.jpg" page="2" />
</artifact>

<!-- Website content -->
<artifact name="example.com_2022-01-01.html" mimetype="text/html">
<text>This is an example text block.</text>
<image filename="image1.jpg" />
<text>This is another example text block.</text>
<image filename="image2.jpg" />
</artifact>
</artifact-examples>

Any materials provided have been cleared for access. Extract and preserve this data for future use.
</instructions>`;
};
|
|
1082
|
+
// Wrap the rendered artifact XML in the user-turn prompt for extraction.
var extractorUserPrompt = (artifactsXml) => [
  "<artifacts>",
  artifactsXml,
  "</artifacts>",
  "",
  "<task>Extract the contents of the given artifacts.</task>"
].join("\n");
|
|
1089
|
+
// Assemble the full system/user prompt pair for an extraction request.
var buildExtractorPrompt = (artifacts, schema, outputInstructions) => ({
  system: extractorSystemPrompt(schema, outputInstructions),
  user: extractorUserPrompt(formatArtifactsXml(artifacts))
});
|
|
1096
|
+
|
|
1097
|
+
// src/tokenization.ts
|
|
1098
|
+
// Heuristic token-budget defaults: roughly 4 characters per text token and a
// flat 1000-token cost per image.
var defaultOptions = {
  textTokenRatio: 4,
  defaultImageTokens: 1e3
};
// Overlay caller options onto the defaults.
var mergeOptions = (options) => ({ ...defaultOptions, ...options ?? {} });
// Character-count heuristic, rounded up so non-empty text is never 0 tokens.
var estimateTextTokens = (text, options) => Math.ceil(text.length / mergeOptions(options).textTokenRatio);
// Flat per-image estimate; the image itself is not inspected.
var estimateImageTokens = (_image, options) => mergeOptions(options).defaultImageTokens;
// Token cost of one content block: its text, plus each media item's flat
// image cost and any media caption text.
var countContentTokens = (content, options) => {
  let total = content.text ? estimateTextTokens(content.text, options) : 0;
  for (const media of content.media ?? []) {
    total += estimateImageTokens(media, options);
    if (media.text) {
      total += estimateTextTokens(media.text, options);
    }
  }
  return total;
};
// Token cost of a whole artifact; a precomputed `tokens` field wins.
var countArtifactTokens = (artifact, options) => {
  if (typeof artifact.tokens === "number") {
    return artifact.tokens;
  }
  let total = 0;
  for (const content of artifact.contents) {
    total += countContentTokens(content, options);
  }
  return total;
};
// Total number of media items across an artifact's contents.
var countArtifactImages = (artifact) => {
  let count = 0;
  for (const content of artifact.contents) {
    count += content.media?.length ?? 0;
  }
  return count;
};
|
|
1143
|
+
|
|
1144
|
+
// src/chunking/ArtifactSplitter.ts
|
|
1145
|
+
/**
 * Split one content block into token-bounded pieces.
 * Non-text blocks and text already within budget pass through unchanged.
 * Slicing is character-based: chunkSize = maxTokens * textTokenRatio chars.
 * Media attachments stay on the first slice only.
 */
var splitTextIntoChunks = (content, maxTokens, options, debug, artifactId) => {
  if (!content.text) {
    return [content];
  }
  const totalTokens = estimateTextTokens(content.text, options);
  if (totalTokens <= maxTokens) {
    return [content];
  }
  const ratio = options?.textTokenRatio ?? 4;
  // Guard against a zero/negative chunk size, which would loop forever below.
  const chunkSize = Math.max(1, maxTokens * ratio);
  const chunks = [];
  if (debug && artifactId) {
    debug.chunkingSplit({
      artifactId,
      originalContentCount: 1,
      splitContentCount: Math.ceil(content.text.length / chunkSize),
      splitReason: "text_too_long",
      originalTokens: totalTokens,
      chunkSize
    });
  }
  for (let offset = 0; offset < content.text.length; offset += chunkSize) {
    const text = content.text.slice(offset, offset + chunkSize);
    chunks.push({
      page: content.page,
      text,
      // Only the first slice keeps the original media attachments.
      media: offset === 0 ? content.media : void 0
    });
  }
  return chunks;
};
|
|
1176
|
+
/**
 * Split one artifact into parts that each fit within maxTokens (and
 * optionally maxImages). Text contents are first pre-sliced via
 * splitTextIntoChunks, then greedily packed into parts. Every returned part
 * gets an id of the form `<original>:part:<n>` and a `tokens` count.
 *
 * Fix: the debug splitReason was computed as
 * `exceedsTokens ? "content_limit" : "content_limit"` — a dead ternary whose
 * branches were identical; it is now the plain constant (behavior unchanged).
 * NOTE(review): the image-limit case may have been intended to report a
 * distinct reason — confirm against the debug-event consumers.
 */
var splitArtifact = (artifact, options) => {
  const { maxTokens, maxImages, debug } = options;
  const splitContents = [];
  const totalTokens = countArtifactTokens(artifact, options);
  debug?.chunkingStart({
    artifactId: artifact.id,
    totalTokens,
    maxTokens,
    maxImages
  });
  // Pass 1: slice any over-budget text content into smaller pieces.
  for (const content of artifact.contents) {
    splitContents.push(...splitTextIntoChunks(content, maxTokens, options, debug, artifact.id));
  }
  // Pass 2: greedily pack the pieces into parts under the token/image budgets.
  const chunks = [];
  let currentContents = [];
  let currentTokens = 0;
  let currentImages = 0;
  for (const content of splitContents) {
    const contentTokens = countContentTokens(content, options);
    const contentImages = content.media?.length ?? 0;
    // A part is never left empty: the first piece always goes in, even when
    // it alone exceeds a budget.
    const exceedsTokens = currentContents.length > 0 && currentTokens + contentTokens > maxTokens;
    const exceedsImages = maxImages !== void 0 && currentContents.length > 0 && currentImages + contentImages > maxImages;
    if (exceedsTokens || exceedsImages) {
      if (debug) {
        debug.chunkingSplit({
          artifactId: artifact.id,
          originalContentCount: splitContents.length,
          splitContentCount: chunks.length + 1,
          splitReason: "content_limit",
          originalTokens: totalTokens,
          chunkSize: maxTokens
        });
      }
      chunks.push({
        ...artifact,
        id: `${artifact.id}:part:${chunks.length + 1}`,
        contents: currentContents,
        tokens: currentTokens
      });
      currentContents = [];
      currentTokens = 0;
      currentImages = 0;
    }
    currentContents.push(content);
    currentTokens += contentTokens;
    currentImages += contentImages;
  }
  if (currentContents.length > 0) {
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:${chunks.length + 1}`,
      contents: currentContents,
      tokens: currentTokens
    });
  }
  // An artifact with no contents still yields a single part.
  if (chunks.length === 0) {
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:1`,
      tokens: countArtifactTokens(artifact, options)
    });
  }
  debug?.chunkingResult({
    artifactId: artifact.id,
    chunksCreated: chunks.length,
    chunkSizes: chunks.map((c) => c.tokens ?? 0)
  });
  return chunks;
};
|
|
1245
|
+
|
|
1246
|
+
// src/chunking/ArtifactBatcher.ts
|
|
1247
|
+
/**
 * Pack artifacts into batches that respect token and image budgets.
 * Oversized artifacts are first split (splitArtifact); the resulting splits
 * are then greedily accumulated, starting a new batch whenever adding the
 * next split would exceed maxTokens or maxImages.
 */
var batchArtifacts = (artifacts, options) => {
  const debug = options.debug;
  // The model's own context limit caps the configured batch budget.
  const maxTokens = options.modelMaxTokens ? Math.min(options.maxTokens, options.modelMaxTokens) : options.maxTokens;
  debug?.batchingStart({
    totalArtifacts: artifacts.length,
    maxTokens: options.maxTokens,
    maxImages: options.maxImages,
    modelMaxTokens: options.modelMaxTokens,
    effectiveMaxTokens: maxTokens
  });
  const batches = [];
  let currentBatch = [];
  let currentTokens = 0;
  let currentImages = 0;
  for (const artifact of artifacts) {
    const splitOptions = {
      maxTokens,
      debug
    };
    // Optional knobs are only copied when set, so splitArtifact's defaults apply.
    if (options.maxImages !== void 0) splitOptions.maxImages = options.maxImages;
    if (options.textTokenRatio !== void 0) splitOptions.textTokenRatio = options.textTokenRatio;
    if (options.defaultImageTokens !== void 0) splitOptions.defaultImageTokens = options.defaultImageTokens;
    const splits = splitArtifact(artifact, splitOptions);
    for (const split of splits) {
      const splitTokens = countArtifactTokens(split, options);
      const splitImages = countArtifactImages(split);
      // The first split always joins the current batch, even over budget,
      // so no batch is ever empty.
      const exceedsTokens = currentBatch.length > 0 && currentTokens + splitTokens > maxTokens;
      const exceedsImages = options.maxImages !== void 0 && currentBatch.length > 0 && currentImages + splitImages > options.maxImages;
      if (exceedsTokens || exceedsImages) {
        debug?.batchCreated({
          batchIndex: batches.length,
          artifactCount: currentBatch.length,
          totalTokens: currentTokens,
          totalImages: currentImages,
          artifactIds: currentBatch.map((a) => a.id)
        });
        batches.push(currentBatch);
        currentBatch = [];
        currentTokens = 0;
        currentImages = 0;
      }
      currentBatch.push(split);
      currentTokens += splitTokens;
      currentImages += splitImages;
    }
  }
  // Flush the trailing partial batch.
  if (currentBatch.length > 0) {
    debug?.batchCreated({
      batchIndex: batches.length,
      artifactCount: currentBatch.length,
      totalTokens: currentTokens,
      totalImages: currentImages,
      artifactIds: currentBatch.map((a) => a.id)
    });
    batches.push(currentBatch);
  }
  debug?.batchingComplete({
    totalBatches: batches.length,
    batches: batches.map((batch, index) => ({
      index,
      artifactCount: batch.length,
      tokens: batch.reduce((sum, a) => sum + (a.tokens ?? 0), 0),
      images: batch.reduce(
        (sum, a) => sum + a.contents.reduce((c, content) => c + (content.media?.length ?? 0), 0),
        0
      )
    }))
  });
  return batches;
};
|
|
1317
|
+
|
|
1318
|
+
// src/llm/message.ts
|
|
1319
|
+
// Flatten every media item across the given artifacts into image message
// parts. Source precedence per media item: inline bytes (`contents`), then
// base64 payload, then URL reference; items with none of the three are skipped.
var collectImages = (artifacts) => {
  const parts = [];
  for (const artifact of artifacts) {
    for (const content of artifact.contents) {
      for (const media of content.media ?? []) {
        const image = media.contents || media.base64 || media.url;
        if (image) {
          parts.push({ type: "image", image });
        }
      }
    }
  }
  return parts;
};
// Compose the user-message content: plain text when the artifacts carry no
// images, otherwise a multi-part array with the text part first.
var buildUserContent = (text, artifacts) => {
  const images = collectImages(artifacts);
  return images.length === 0 ? text : [{ type: "text", text }, ...images];
};
|
|
1346
|
+
|
|
1347
|
+
// src/llm/LLMClient.ts
|
|
1348
|
+
import { generateText, Output, jsonSchema } from "ai";
|
|
1349
|
+
// Duck-type check for a Zod schema: any non-null object exposing a
// `safeParse` function is treated as one.
var isZodSchema = (schema) => {
  if (typeof schema !== "object" || schema === null) return false;
  return "safeParse" in schema && typeof schema.safeParse === "function";
};
|
|
1352
|
+
/**
 * Run a structured-output LLM call via the AI SDK (generateText + Output.object).
 * Accepts either a Zod schema or a plain JSON schema (wrapped via jsonSchema),
 * optionally routes OpenRouter requests to a preferred upstream provider, maps
 * common provider/API failures to actionable error messages, and records
 * telemetry for both the success and failure paths.
 *
 * Fix: the original passed `providerOptions` twice to generateText — a literal
 * `{ openai: { strictJsonSchema } }` followed by a conditional spread of the
 * OpenRouter routing object. When a preferred provider was set, the later
 * spread replaced the whole key and silently dropped the OpenAI
 * strictJsonSchema setting. The two are now merged into one object.
 *
 * @returns {Promise<{data: unknown, usage: {inputTokens: number, outputTokens: number, totalTokens: number}}>}
 */
var generateStructured = async (request) => {
  const { telemetry, parentSpan } = request;
  const llmSpan = telemetry?.startSpan({
    name: "llm.generateStructured",
    kind: "LLM",
    parentSpan,
    attributes: {
      "llm.schema_name": request.schemaName ?? "extract",
      "llm.strict": request.strict ?? false
    }
  });
  const startTime = Date.now();
  const schema = isZodSchema(request.schema) ? request.schema : jsonSchema(request.schema);
  // Non-standard marker attached to the model object by the OpenRouter
  // wrapper — presumably; confirm against the model-construction code.
  const preferredProvider = request.model?.__openrouter_provider;
  if (preferredProvider && process.env.DEBUG) {
    console.error(
      `[DEBUG] Routing to OpenRouter provider: ${preferredProvider}`
    );
  }
  const providerOptions = preferredProvider ? {
    openrouter: {
      provider: {
        order: [preferredProvider]
      }
    }
  } : void 0;
  let result;
  try {
    result = await generateText({
      model: request.model,
      output: Output.object({
        schema,
        name: request.schemaName ?? "extract",
        description: request.schemaDescription
      }),
      // Merge OpenAI strict-mode with the optional OpenRouter routing options
      // (previously the routing spread clobbered this whole object).
      providerOptions: {
        openai: {
          strictJsonSchema: request.strict ?? false
        },
        ...providerOptions ?? {}
      },
      system: request.system,
      messages: request.messages ?? [
        { role: "user", content: request.user }
      ]
    });
  } catch (error) {
    // Human-readable model identifier for the error messages below.
    const modelId = typeof request.model === "object" && request.model !== null ? request.model.modelId ?? JSON.stringify(request.model) : String(request.model);
    if (error && typeof error === "object" && "responseBody" in error && "statusCode" in error) {
      const apiError = error;
      const responseBody = apiError.responseBody;
      const errorData = apiError.data;
      if (typeof responseBody === "string" && responseBody.includes("No endpoints found that support image input")) {
        throw new Error(
          `Model "${modelId}" does not support image input. Please use a model that supports images (e.g., gpt-4o, claude-3-5-sonnet, gemini-1.5-pro) or remove the --images and --screenshots flags.`
        );
      }
      if (errorData?.code === 500 || errorData?.message?.includes("Internal Server Error")) {
        throw new Error(
          `Provider error for model "${modelId}": Internal server error. The model or provider may be experiencing issues. Please try again or use a different model.`
        );
      }
      if (apiError.statusCode === 401 || errorData?.code === 401) {
        throw new Error(
          `Authentication failed for model "${modelId}". Please check your API key is valid and has the necessary permissions.`
        );
      }
      if (apiError.statusCode === 403 || errorData?.code === 403) {
        throw new Error(
          `Access denied for model "${modelId}". Your API key may not have access to this model. Please check your subscription or try a different model.`
        );
      }
      if (apiError.statusCode === 429 || errorData?.code === 429) {
        throw new Error(
          `Rate limit exceeded for model "${modelId}". Please wait a moment and try again, or use a different model.`
        );
      }
      if (apiError.statusCode === 404 || errorData?.code === 404) {
        const errorMsg = errorData?.message || "Model not found";
        throw new Error(
          `Model "${modelId}" not found or unavailable. ${errorMsg} Please check the model name or try a different model.`
        );
      }
      if (errorData?.message) {
        throw new Error(
          `Provider error for model "${modelId}": ${errorData.message}`
        );
      }
    }
    // NOTE(review): the mapped provider errors above throw before reaching
    // this telemetry block, so only unmapped errors are recorded — confirm
    // that is intentional.
    if (llmSpan && telemetry) {
      const latencyMs = Date.now() - startTime;
      telemetry.recordEvent(llmSpan, {
        type: "llm_call",
        model: modelId,
        provider: "unknown",
        // Will be determined by the model
        input: {
          messages: request.messages ?? [{ role: "user", content: typeof request.user === "string" ? request.user : "" }],
          temperature: void 0,
          maxTokens: void 0,
          schema: request.schema
        },
        error: error instanceof Error ? error : new Error(String(error)),
        latencyMs
      });
      telemetry.endSpan(llmSpan, {
        status: "error",
        error: error instanceof Error ? error : new Error(String(error)),
        latencyMs
      });
    }
    throw error;
  }
  // Usage field names differ across AI SDK result shapes: support both the
  // promptTokens/completionTokens and inputTokens/outputTokens variants.
  const usageRaw = result.usage ?? {};
  const inputTokens = "promptTokens" in usageRaw ? usageRaw.promptTokens : usageRaw.inputTokens ?? 0;
  const outputTokens = "completionTokens" in usageRaw ? usageRaw.completionTokens : usageRaw.outputTokens ?? 0;
  const totalTokens = "totalTokens" in usageRaw ? usageRaw.totalTokens : inputTokens + outputTokens;
  const usage = {
    inputTokens,
    outputTokens,
    totalTokens
  };
  if (llmSpan && telemetry) {
    const latencyMs = Date.now() - startTime;
    telemetry.recordEvent(llmSpan, {
      type: "llm_call",
      model: typeof request.model === "object" && request.model !== null ? request.model.modelId ?? "unknown" : String(request.model),
      provider: preferredProvider ?? "unknown",
      input: {
        messages: request.messages ?? [{ role: "user", content: typeof request.user === "string" ? request.user : "" }],
        temperature: void 0,
        maxTokens: void 0,
        schema: request.schema
      },
      output: {
        content: JSON.stringify(result.output),
        structured: true,
        usage: {
          input: inputTokens,
          output: outputTokens,
          total: totalTokens
        }
      },
      latencyMs
    });
    telemetry.endSpan(llmSpan, {
      status: "ok",
      output: result.output,
      latencyMs
    });
  }
  return { data: result.output, usage };
};
|
|
1505
|
+
|
|
1506
|
+
// src/llm/RetryingRunner.ts
|
|
1507
|
+
// src/llm/RetryingRunner.ts
//
// Runs one structured-LLM extraction with schema-validation retries.
//
// Per attempt:
//   1. Call the executor (options.execute, falling back to generateStructured)
//      with the accumulated conversation `messages`.
//   2. Validate the response against options.schema with Ajv. Strict
//      validation applies when options.strict === true OR on the final
//      attempt; earlier non-strict attempts tolerate missing required fields
//      (validateAllowingMissingRequired).
//   3. On SchemaValidationError, append the Ajv errors as a
//      <validation-errors> user message and retry; any other error aborts
//      immediately. On success, return { data, usage } with token usage
//      summed over all attempts.
//
// NOTE(review): when every attempt fails validation, the function throws
// lastError without calling telemetry.endSpan(retrySpan, ...) — the
// "struktur.validation_retry" span looks like it is left open on that path;
// confirm whether the telemetry backend auto-closes orphaned spans.
var runWithRetries = async (options) => {
  const { telemetry, parentSpan } = options;
  // Optional span wrapping the whole attempt loop.
  const retrySpan = telemetry?.startSpan({
    name: "struktur.validation_retry",
    kind: "CHAIN",
    parentSpan,
    attributes: {
      "retry.max_attempts": options.maxAttempts ?? 3,
      "retry.schema_name": options.schemaName ?? "extract"
    }
  });
  const ajv = createAjv();
  const maxAttempts = options.maxAttempts ?? 3;
  // Conversation grows across attempts: validation feedback is pushed here
  // so the model can correct itself on the next call.
  const messages = [{ role: "user", content: options.user }];
  const debug = options.debug;
  // Correlates all debug events of this logical call.
  const callId = options.callId ?? `call_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  // Accumulated token usage across every attempt, returned to the caller.
  let usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let lastError;
  const systemLength = options.system.length;
  // `user` may be a string or a structured content array; measure either way.
  const userLength = typeof options.user === "string" ? options.user.length : JSON.stringify(options.user).length;
  debug?.llmCallStart({
    callId,
    model: JSON.stringify(options.model),
    schemaName: options.schemaName,
    systemLength,
    userLength,
    artifactCount: Array.isArray(options.user) ? options.user.length : 0
  });
  debug?.promptSystem({ callId, system: options.system });
  debug?.promptUser({ callId, user: options.user });
  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    const executor = options.execute ?? generateStructured;
    const isFinalAttempt = attempt === maxAttempts;
    // Strict validation is forced on the last attempt even in lenient mode,
    // so the returned data is guaranteed to satisfy the full schema.
    const useStrictValidation = options.strict === true || isFinalAttempt;
    debug?.validationStart({
      callId,
      attempt,
      maxAttempts,
      strict: useStrictValidation
    });
    const startTime = Date.now();
    const result = await executor({
      model: options.model,
      schema: options.schema,
      schemaName: options.schemaName,
      system: options.system,
      user: options.user,
      messages,
      strict: options.strict,
      telemetry,
      parentSpan: retrySpan
    });
    const durationMs = Date.now() - startTime;
    // Every attempt costs tokens; sum them regardless of validation outcome.
    usage = {
      inputTokens: usage.inputTokens + result.usage.inputTokens,
      outputTokens: usage.outputTokens + result.usage.outputTokens,
      totalTokens: usage.totalTokens + result.usage.totalTokens
    };
    debug?.rawResponse({ callId, response: result.data });
    try {
      if (useStrictValidation) {
        // Throws SchemaValidationError on any schema violation.
        const validated = validateOrThrow(
          ajv,
          options.schema,
          result.data
        );
        debug?.validationSuccess({ callId, attempt });
        debug?.llmCallComplete({
          callId,
          success: true,
          inputTokens: usage.inputTokens,
          outputTokens: usage.outputTokens,
          totalTokens: usage.totalTokens,
          durationMs
        });
        if (retrySpan && telemetry) {
          telemetry.recordEvent(retrySpan, {
            type: "validation",
            attempt,
            maxAttempts,
            schema: options.schema,
            input: result.data,
            success: true,
            latencyMs: durationMs
          });
          telemetry.endSpan(retrySpan, {
            status: "ok",
            output: validated,
            latencyMs: durationMs
          });
        }
        return { data: validated, usage };
      } else {
        // Lenient path: missing required fields are tolerated until the
        // final attempt (isFinalAttempt tightens the check).
        const validationResult = validateAllowingMissingRequired(
          ajv,
          options.schema,
          result.data,
          isFinalAttempt
        );
        if (validationResult.valid) {
          debug?.validationSuccess({ callId, attempt });
          debug?.llmCallComplete({
            callId,
            success: true,
            inputTokens: usage.inputTokens,
            outputTokens: usage.outputTokens,
            totalTokens: usage.totalTokens,
            durationMs
          });
          if (retrySpan && telemetry) {
            telemetry.recordEvent(retrySpan, {
              type: "validation",
              attempt,
              maxAttempts,
              schema: options.schema,
              input: result.data,
              success: true,
              latencyMs: durationMs
            });
            telemetry.endSpan(retrySpan, {
              status: "ok",
              output: validationResult.data,
              latencyMs: durationMs
            });
          }
          return { data: validationResult.data, usage };
        }
        // Converted to the retryable error type handled below.
        throw new SchemaValidationError(
          "Schema validation failed",
          validationResult.errors
        );
      }
    } catch (error) {
      lastError = error;
      if (error instanceof SchemaValidationError) {
        debug?.validationFailed({
          callId,
          attempt,
          errors: error.errors
        });
        if (retrySpan && telemetry) {
          telemetry.recordEvent(retrySpan, {
            type: "validation",
            attempt,
            maxAttempts,
            schema: options.schema,
            input: result.data,
            success: false,
            errors: error.errors,
            latencyMs: durationMs
          });
        }
        const nextAttempt = attempt + 1;
        if (nextAttempt <= maxAttempts) {
          await options.events?.onRetry?.({
            attempt: nextAttempt,
            maxAttempts,
            reason: "schema_validation_failed"
          });
          debug?.retry({
            callId,
            attempt: nextAttempt,
            maxAttempts,
            reason: "schema_validation_failed"
          });
        }
        // Feed the Ajv errors back to the model as a user turn so the next
        // attempt can correct them. (Also pushed after the final attempt,
        // where it is unused — harmless but wasted work.)
        const errorPayload = JSON.stringify(error.errors, null, 2);
        const errorMessage = `<validation-errors>
${errorPayload}
</validation-errors>`;
        messages.push({ role: "user", content: errorMessage });
        await options.events?.onMessage?.({
          role: "user",
          content: errorMessage
        });
        continue;
      }
      // Non-validation errors (transport, provider, etc.) are not retried.
      debug?.llmCallComplete({
        callId,
        success: false,
        inputTokens: usage.inputTokens,
        outputTokens: usage.outputTokens,
        totalTokens: usage.totalTokens,
        durationMs,
        error: error.message
      });
      if (retrySpan && telemetry) {
        telemetry.endSpan(retrySpan, {
          status: "error",
          error,
          latencyMs: durationMs
        });
      }
      break;
    }
  }
  throw lastError ?? new Error("Unknown extraction error");
};
|
|
1705
|
+
|
|
1706
|
+
// src/strategies/utils.ts
|
|
1707
|
+
// src/strategies/utils.ts
// Serializes a JSON schema object to its compact JSON string form for
// embedding in prompts.
var serializeSchema = (schema) => JSON.stringify(schema);
|
|
1710
|
+
// Sums token-usage records field-by-field; an empty list yields all zeros.
var mergeUsage = (usages) => {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;
  for (const entry of usages) {
    inputTokens += entry.inputTokens;
    outputTokens += entry.outputTokens;
    totalTokens += entry.totalTokens;
  }
  return { inputTokens, outputTokens, totalTokens };
};
|
|
1720
|
+
// Splits `artifacts` into batches via batchArtifacts, optionally wrapped in a
// "struktur.chunking" telemetry span. When a span is active, one "chunk"
// event (with token and image tallies) is recorded per batch before the span
// is closed.
var getBatches = (artifacts, options, debug, telemetry, parentSpan) => {
  const chunkingSpan = telemetry?.startSpan({
    name: "struktur.chunking",
    kind: "RETRIEVER",
    parentSpan,
    attributes: {
      "chunking.artifact_count": artifacts.length,
      "chunking.max_tokens": options.maxTokens,
      "chunking.max_images": options.maxImages
    }
  });
  const batches = batchArtifacts(artifacts, { ...options, debug });
  if (chunkingSpan && telemetry) {
    for (const [batchIndex, batch] of batches.entries()) {
      let tokenTotal = 0;
      let imageTotal = 0;
      for (const artifact of batch) {
        tokenTotal += artifact.tokens || 0;
        imageTotal += artifact.contents?.flatMap((c) => c.media || []).length || 0;
      }
      telemetry.recordEvent(chunkingSpan, {
        type: "chunk",
        chunkIndex: batchIndex,
        totalChunks: batches.length,
        tokens: tokenTotal,
        images: imageTotal
      });
    }
    telemetry.endSpan(chunkingSpan, {
      status: "ok",
      output: { batchCount: batches.length }
    });
  }
  return batches;
};
|
|
1749
|
+
// Convenience wrapper: inlines the artifacts into the user prompt via
// buildUserContent, then delegates to runWithRetries for the validated call.
var extractWithPrompt = async (options) => {
  const {
    model,
    schema,
    system,
    user,
    artifacts,
    events,
    execute,
    strict,
    debug,
    callId,
    telemetry,
    parentSpan
  } = options;
  const userContent = buildUserContent(user, artifacts);
  return runWithRetries({
    model,
    schema,
    system,
    user: userContent,
    events,
    execute,
    strict,
    debug,
    callId,
    telemetry,
    parentSpan
  });
};
|
|
1766
|
+
|
|
1767
|
+
// src/strategies/SimpleStrategy.ts
|
|
1768
|
+
// src/strategies/SimpleStrategy.ts
// Single-shot strategy: one prompt over all artifacts, no chunking and no
// merge phase.
var SimpleStrategy = class {
  name = "simple";
  config;
  constructor(config) {
    this.config = config;
  }
  // Fixed estimate used for progress reporting.
  getEstimatedSteps() {
    return 3;
  }
  // Builds one extractor prompt covering every artifact, runs a single
  // validated extraction, and reports progress/telemetry along the way.
  async run(options) {
    const { debug, events, artifacts } = options;
    const telemetry = options.telemetry ?? void 0;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.simple",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": artifacts.length
      }
    });
    const serializedSchema = serializeSchema(options.schema);
    const { system, user } = buildExtractorPrompt(
      artifacts,
      serializedSchema,
      this.config.outputInstructions
    );
    const totalSteps = this.getEstimatedSteps();
    await events?.onStep?.({ step: 1, total: totalSteps, label: "extract" });
    debug?.step({ step: 1, total: totalSteps, label: "extract", strategy: this.name });
    const extraction = await extractWithPrompt({
      model: this.config.model,
      schema: options.schema,
      system,
      user,
      artifacts,
      events,
      execute: this.config.execute,
      strict: options.strict ?? this.config.strict,
      debug,
      callId: "simple_extract",
      telemetry,
      parentSpan: strategySpan
    });
    debug?.step({ step: 2, total: totalSteps, label: "complete", strategy: this.name });
    telemetry?.endSpan(strategySpan, { status: "ok", output: extraction.data });
    return { data: extraction.data, usage: extraction.usage };
  }
};
|
|
1832
|
+
// Factory for the single-shot extraction strategy.
var simple = (config) => new SimpleStrategy(config);
|
|
1835
|
+
|
|
1836
|
+
// src/prompts/ParallelMergerPrompt.ts
|
|
1837
|
+
// src/prompts/ParallelMergerPrompt.ts
// Builds the { system, user } prompt pair that asks the model to merge
// several partial extraction results (dataList) into one schema-conforming
// object. Null/undefined partial results are dropped.
var buildParallelMergerPrompt = (schema, dataList) => {
  const wrappedObjects = [];
  for (const item of dataList) {
    if (item !== null && item !== void 0) {
      wrappedObjects.push(`<json-object>${JSON.stringify(item)}</json-object>`);
    }
  }
  const jsonObjects = wrappedObjects.join("\n");
  const system = `You are a data merger. Combine multiple JSON objects into one object matching the provided schema.

<thinking>
Before merging, consider:
1. Which input objects contain data for each schema field?
2. How should conflicting values be resolved (prefer more complete/recent data)?
3. Are there arrays that need to be concatenated vs deduplicated?
4. Ensure NO information is lost from any input
</thinking>

<rules>
- Produce a single JSON object following the schema exactly
- Combine all information from input objects without losing data
- Resolve conflicts intelligently (prefer richer/more specific data)
- Output ONLY valid JSON - no markdown, no explanations
</rules>`;
  const user = `<json-schema>
${schema}
</json-schema>

<json-objects>
${jsonObjects}
</json-objects>`;
  return { system, user };
};
|
|
1864
|
+
|
|
1865
|
+
// src/strategies/concurrency.ts
|
|
1866
|
+
// src/strategies/concurrency.ts
// Executes task factories in waves of at most `concurrency`, preserving
// input order in the results. Each wave is awaited fully (fail-fast on the
// first rejection) before the next starts.
var runConcurrently = async (tasks, concurrency) => {
  const collected = [];
  let offset = 0;
  while (offset < tasks.length) {
    const wave = tasks.slice(offset, offset + concurrency);
    const settled = await Promise.all(wave.map((start) => start()));
    collected.push(...settled);
    offset += concurrency;
  }
  return collected;
};
|
|
1875
|
+
|
|
1876
|
+
// src/strategies/ParallelStrategy.ts
|
|
1877
|
+
// src/strategies/ParallelStrategy.ts
//
// Map/reduce strategy: artifacts are batched, each batch is extracted
// concurrently (bounded by config.concurrency), then the partial results are
// merged by a second LLM call using the parallel-merger prompt.
var ParallelStrategy = class {
  name = "parallel";
  // Holds model, mergeModel, chunkSize, maxImages, concurrency, strict,
  // execute, and outputInstructions (as read by run() below).
  config;
  constructor(config) {
    this.config = config;
  }
  // One step per batch, plus three bookkeeping steps.
  // Note: calls getBatches without debug/telemetry, so no chunking span is
  // opened for this estimation pass.
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    // NOTE(review): `step` is incremented from inside concurrently-running
    // tasks below. JS is single-threaded so there is no data race, but the
    // step numbers reported by out-of-order batch completions may not match
    // the batch labels; confirm this is acceptable for progress UIs.
    let step = 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: batches.length > 1 ? `batch 1/${batches.length}` : "extract"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
      strategy: this.name
    });
    // One deferred task per batch; each runs a full validated extraction.
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `parallel_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: strategySpan
      });
      const completedIndex = index + 1;
      // Progress is only reported for non-final batches; the merge phase
      // reports the remaining steps.
      if (completedIndex < batches.length) {
        step += 1;
        await options.events?.onStep?.({
          step,
          total: totalSteps,
          label: `batch ${completedIndex + 1}/${batches.length}`
        });
        debug?.step({
          step,
          total: totalSteps,
          label: `batch ${completedIndex + 1}/${batches.length}`,
          strategy: this.name
        });
      }
      return result;
    });
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    debug?.mergeStart({
      mergeId: "parallel_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length
      }
    });
    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data)
    );
    const merged = await extractWithPrompt({
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute,
      // NOTE(review): batch extractions use `options.strict ?? this.config.strict`
      // but the merge uses only `this.config.strict`, so a per-run strict
      // override does not reach the merge call — confirm whether intended.
      strict: this.config.strict,
      debug,
      callId: "parallel_merge",
      telemetry: telemetry ?? void 0,
      parentSpan: mergeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "merge"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "merge",
      strategy: this.name
    });
    debug?.mergeComplete({ mergeId: "parallel_merge", success: true });
    if (mergeSpan && telemetry) {
      telemetry.recordEvent(mergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1
      });
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged.data
      });
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: merged.data
    });
    // Usage = every batch extraction plus the merge call.
    return {
      data: merged.data,
      usage: mergeUsage([...results.map((r) => r.usage), merged.usage])
    };
  }
};
|
|
2035
|
+
// Factory for the concurrent map/reduce extraction strategy.
var parallel = (config) => new ParallelStrategy(config);
|
|
2038
|
+
|
|
2039
|
+
// src/prompts/SequentialExtractorPrompt.ts
|
|
2040
|
+
// src/prompts/SequentialExtractorPrompt.ts
//
// Renders the system prompt for the sequential strategy: the model is told
// to extract according to `schema` while enriching (never discarding) the
// previous batch's accumulated data. `outputInstructions` is interpolated
// into the <output-instructions> section; when absent, a fixed placeholder
// sentence is used. The template text below is part of the runtime prompt —
// do not edit it casually.
var sequentialSystemPrompt = (schema, outputInstructions) => {
  return `<instructions>
You are a precise data extraction engine. Extract data from provided artifacts according to the JSON schema, enriching any previous data you receive.

<thinking>
Before extracting, consider:
1. Review previous data - what needs to be preserved vs enriched?
2. Which new fields have clear values in the artifacts?
3. Which fields remain missing or unclear (keep null from previous or set to null)?
4. Can new information improve the structure of existing data?
5. Ensure NO information is lost from previous data
</thinking>

<rules>
- Merge new artifacts into existing data - do not create fresh objects
- Preserve ALL previous data - losing information breaks the processing chain
- Use null for missing/uncertain values in new fields
- Only extract information explicitly present in the artifacts
- Output ONLY valid JSON matching the schema
- No markdown, explanations, or code fences
</rules>

<image-handling>
Some schema properties may reference artifact IDs (e.g., 'xxx_artifact_id' fields).
When assigning images to properties:
- Use format: artifact:ID/images/imageNUM.EXT (e.g., 'artifact:123456/images/image1.jpg')
- Only reference images you can actually see in the provided documents/images
- Image references are visible in artifact XML or written on images
- NEVER make up artifact IDs or use normal URLs
</image-handling>

<output-instructions>
${outputInstructions ?? "No additional output instructions provided."}
</output-instructions>

<json-schema>
${schema}
</json-schema>

<how-to-output>
Return the complete extracted data as valid JSON matching the schema.
Include all information from previous data, enriched with the new artifacts.
</how-to-output>
</instructions>`;
};
|
|
2085
|
+
// Renders the per-batch user prompt for the sequential strategy: the new
// artifacts, the JSON accumulator from prior batches, the merge task, and
// any extra output instructions (empty string when not provided). The
// sections are joined with blank lines, byte-identical to the original
// single template.
var sequentialUserPrompt = (artifactsXml, previousData, outputInstructions) => {
  const previousBlock = `<previous-data>\n${previousData}\n</previous-data>`;
  const taskBlock = [
    "<task>",
    "Extract the contents of the given artifacts and ADD/MERGE them into the previous data contained in the <previous-data> tag.",
    "You MUST NOT lose any information from the previous data. All previous data must be included in your response.",
    "</task>"
  ].join("\n");
  const instructionsBlock = `<output-instructions>\n${outputInstructions ?? ""}\n</output-instructions>`;
  return [artifactsXml, previousBlock, taskBlock, instructionsBlock].join("\n\n");
};
|
|
2101
|
+
// Builds the { system, user } prompt pair for one sequential-strategy batch.
// `previousData` is the JSON-serialized accumulator from earlier batches.
var buildSequentialPrompt = (artifacts, schema, previousData, outputInstructions) => {
  const artifactsXml = formatArtifactsXml(artifacts);
  const system = sequentialSystemPrompt(schema, outputInstructions);
  const user = sequentialUserPrompt(artifactsXml, previousData, outputInstructions);
  return { system, user };
};
|
|
2108
|
+
|
|
2109
|
+
// src/strategies/SequentialStrategy.ts
|
|
2110
|
+
// src/strategies/SequentialStrategy.ts
//
// Fold strategy: artifacts are batched, then each batch is extracted in
// order, with the previous batch's result serialized into the next prompt
// as <previous-data>. Later batches must preserve and enrich earlier data.
var SequentialStrategy = class {
  name = "sequential";
  // Holds model, chunkSize, maxImages, strict, execute, and
  // outputInstructions (as read by run() below).
  config;
  constructor(config) {
    this.config = config;
  }
  // One step per batch, plus two bookkeeping steps.
  // Note: calls getBatches without debug/telemetry, so no chunking span is
  // opened for this estimation pass.
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 2;
  }
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.sequential",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    // Accumulator threaded through the batches; undefined until the first
    // batch completes.
    let currentData;
    const usages = [];
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: batches.length > 1 ? `batch 1/${batches.length}` : "extract"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
      strategy: this.name
    });
    for (const [index, batch] of batches.entries()) {
      // First iteration feeds "{}" so the prompt always has a
      // <previous-data> section.
      const previousData = currentData ? JSON.stringify(currentData) : "{}";
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        previousData,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `sequential_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: strategySpan
      });
      currentData = result.data;
      usages.push(result.usage);
      step += 1;
      // Progress for the *next* batch; the final batch emits no step event.
      if (index < batches.length - 1) {
        await options.events?.onStep?.({
          step,
          total: totalSteps,
          label: `batch ${index + 2}/${batches.length}`
        });
        debug?.step({
          step,
          total: totalSteps,
          label: `batch ${index + 2}/${batches.length}`,
          strategy: this.name
        });
      }
    }
    // Zero batches (empty artifact list) leaves currentData undefined.
    if (!currentData) {
      throw new Error("No data extracted from sequential strategy");
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: currentData
    });
    return { data: currentData, usage: mergeUsage(usages) };
  }
};
|
|
2210
|
+
// Factory for the ordered fold-over-batches extraction strategy.
var sequential = (config) => new SequentialStrategy(config);
|
|
2213
|
+
|
|
2214
|
+
// src/prompts/DeduplicationPrompt.ts
|
|
2215
|
+
// src/prompts/DeduplicationPrompt.ts
//
// Builds the { system, user } prompt pair asking the model to identify
// duplicate entries in `data` (validated against `schema`) and return their
// dot-notation paths as { "keys": [...] }.
//
// Fix: `exampleKeys` was declared (default ["items.3", "items.5"]) but never
// used — the example keys were hard-coded. It is now rendered into the
// example return value; with the default argument the emitted prompt is
// byte-identical to before. The example's lead-in sentence still assumes the
// default indices, so callers passing custom keys should pick paths that
// read sensibly as a sample.
var buildDeduplicationPrompt = (schema, data, exampleKeys = ["items.3", "items.5"]) => {
  // `"items.3", "items.5"` with the default — matches the original literal.
  const exampleList = exampleKeys.map((key) => `"${key}"`).join(", ");
  const system = `You are a deduplication engine. Identify duplicate entries in structured data.

<thinking>
Before deduplicating, consider:
1. Which fields indicate uniqueness for each entity type?
2. Are entries duplicates if they share key fields but differ in minor details?
3. Which entry should be kept (prefer more complete data)?
</thinking>

<rules>
- Identify entries that represent the same entity
- Return paths to duplicates using dot notation (e.g., "items.3", "items.5")
- Output ONLY JSON in format: { "keys": ["path1", "path2"] }
- No markdown, no explanations
</rules>`;
  const user = `<json-schema>
${schema}
</json-schema>

<json-data>
${JSON.stringify(data)}
</json-data>

<task>Identify duplicate entries in the data and return their paths in the format: { "keys": ["path1", "path2"] }</task>

<example>
If items at indices 3 and 5 are duplicates, return: { "keys": [${exampleList}] }
</example>`;
  return { system, user };
};
|
|
2246
|
+
|
|
2247
|
+
// src/merge/SmartDataMerger.ts
|
|
2248
|
+
// src/merge/SmartDataMerger.ts
// True when the (sub)schema declares a JSON array type.
var isArraySchema = (schema) => schema.type === "array";
|
|
2254
|
+
// True when the (sub)schema declares an object type AND carries a
// `properties` member. (Note: `typeof null === "object"`, so a literal
// `properties: null` also satisfies this check — preserved as-is.)
var isObjectSchema = (schema) => {
  const declaresObject = schema.type === "object";
  return declaresObject && typeof schema.properties === "object";
};
|
|
2257
|
+
// Schema-guided shallow merger for extraction results.
var SmartDataMerger = class {
  // JSON schema describing the object being merged; its `properties` drive
  // the per-field merge rules.
  schema;
  constructor(schema) {
    this.schema = schema;
  }
  // Merges `newData` into `currentData` field-by-field:
  //  - array-typed fields: concatenate (non-arrays contribute nothing);
  //  - object-typed fields: shallow-merge, new keys winning;
  //  - anything else: the new value wins unless it is undefined, null, or
  //    the empty string, in which case the current value (if defined) is
  //    kept.
  // Returns a new object; neither input is mutated.
  merge(currentData, newData) {
    const result = { ...currentData };
    const props = this.schema.properties ?? {};
    for (const [field, fieldSchema] of Object.entries(props)) {
      const existing = currentData[field];
      const incoming = newData[field];
      if (isArraySchema(fieldSchema)) {
        const left = Array.isArray(existing) ? existing : [];
        const right = Array.isArray(incoming) ? incoming : [];
        result[field] = [...left, ...right];
      } else if (isObjectSchema(fieldSchema)) {
        const left = typeof existing === "object" && existing ? existing : {};
        const right = typeof incoming === "object" && incoming ? incoming : {};
        result[field] = { ...left, ...right };
      } else if (incoming !== void 0 && incoming !== null && incoming !== "") {
        result[field] = incoming;
      } else if (existing !== void 0) {
        result[field] = existing;
      }
    }
    return result;
  }
};
|
|
2291
|
+
|
|
2292
|
+
// src/merge/Deduplicator.ts
|
|
2293
|
+
// src/merge/Deduplicator.ts
// 32-bit FNV-1a hash of a string (per UTF-16 code unit), returned as an
// unsigned integer. Math.imul keeps the multiply in 32-bit space.
var fnv1a32 = (str) => {
  const FNV_OFFSET_BASIS = 2166136261;
  const FNV_PRIME = 16777619;
  let digest = FNV_OFFSET_BASIS;
  for (let i = 0; i < str.length; i += 1) {
    digest = Math.imul(digest ^ str.charCodeAt(i), FNV_PRIME);
  }
  return digest >>> 0;
};
|
|
2301
|
+
// Canonical JSON-like serialization with object keys sorted via
// localeCompare, so structurally equal objects serialize identically
// regardless of key insertion order. Primitives go through JSON.stringify.
var stableStringify = (value) => {
  if (value === null || typeof value !== "object") {
    return JSON.stringify(value);
  }
  if (Array.isArray(value)) {
    const parts = value.map((element) => stableStringify(element));
    return `[${parts.join(",")}]`;
  }
  const sortedKeys = Object.keys(value).sort((a, b) => a.localeCompare(b));
  const fields = sortedKeys.map((k) => `"${k}":${stableStringify(value[k])}`);
  return `{${fields.join(",")}}`;
};
|
|
2311
|
+
// Returns the indices of items that are exact (structural) duplicates of an
// earlier item. The first occurrence of each distinct value is kept; every
// later occurrence's index is reported.
//
// FIX: the previous implementation keyed the seen-map on the 32-bit FNV-1a
// hash of the canonical string alone, so two DIFFERENT items whose strings
// collide under a 32-bit hash would be flagged as duplicates and one of
// them silently dropped. Keying on the full canonical serialization makes
// the check exact; behavior is unchanged for all non-colliding inputs.
var findExactDuplicatesWithHashing = (items) => {
  // canonical serialization -> index of first occurrence
  const firstSeen = new Map();
  const duplicates = [];
  items.forEach((item, index) => {
    const key = stableStringify(item);
    if (firstSeen.has(key)) {
      duplicates.push(index);
    } else {
      firstSeen.set(key, index);
    }
  });
  return duplicates;
};
|
|
2324
|
+
// Returns a new array containing every item whose position is NOT listed
// in `indices`. The input array is not mutated.
var deduplicateByIndices = (items, indices) => {
  const excluded = new Set(indices);
  const kept = [];
  items.forEach((item, position) => {
    if (!excluded.has(position)) {
      kept.push(item);
    }
  });
  return kept;
};
|
|
2328
|
+
|
|
2329
|
+
// src/strategies/ParallelAutoMergeStrategy.ts
|
|
2330
|
+
// src/strategies/ParallelAutoMergeStrategy.ts
// Response schema for the LLM dedupe call: the model must return
// `keys`, an array of "<field>.<index>" paths identifying duplicate
// array entries to remove (consumed by removeByPath below).
var dedupeSchema = {
  type: "object",
  properties: {
    keys: { type: "array", items: { type: "string" } }
  },
  required: ["keys"],
  additionalProperties: false
};
|
|
2338
|
+
// Removes exact structural duplicates from every top-level array field of
// `data`. Non-array fields are passed through; the input is not mutated
// (the returned object is a shallow copy with deduped array copies).
var dedupeArrays = (data) => {
  const out = { ...data };
  for (const field of Object.keys(out)) {
    const entries = out[field];
    if (!Array.isArray(entries)) {
      continue;
    }
    const dupes = findExactDuplicatesWithHashing(entries);
    out[field] = deduplicateByIndices(entries, dupes);
  }
  return out;
};
|
|
2348
|
+
// Removes a single array element addressed by a "<field>.<index>" path
// (e.g. "items.3"), as produced by the LLM dedupe step. Returns a shallow
// copy with the element spliced out of a copied array; neither the input
// object nor its array is mutated. Any path that does not resolve to a
// plain non-negative integer index into an existing array is ignored.
//
// FIX: the previous `Number(indexStr)` + isNaN guard let two bad inputs
// through: `Number("")` is 0 (so "items." deleted index 0) and
// `Number("-1")` is -1 (splice with a negative start deletes from the END
// of the array). Indices are now validated as digit-only strings.
var removeByPath = (data, path5) => {
  const [root, indexStr] = path5.split(".");
  if (!root || !/^\d+$/.test(indexStr ?? "")) {
    return data;
  }
  const index = Number(indexStr);
  const value = data[root];
  if (!Array.isArray(value)) {
    return data;
  }
  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
|
|
2362
|
+
// Extraction strategy: runs one LLM extraction per artifact batch in
// parallel, structurally merges the per-batch results with SmartDataMerger,
// strips exact duplicates from top-level array fields, then asks a second
// LLM call to flag remaining (semantic) duplicates, which are removed by
// "<field>.<index>" path. Emits debug events and telemetry spans throughout.
var ParallelAutoMergeStrategy = class {
  // Strategy identifier reported in debug/telemetry payloads.
  name = "parallel-auto-merge";
  // Model, chunking, concurrency and execution settings (set in constructor).
  config;
  constructor(config) {
    this.config = config;
  }
  // Progress total: one step per batch plus a fixed allowance of 3
  // (presumably merge + the two dedupe phases — confirm against onStep usage;
  // only 2 of the 3 extra steps actually emit onStep below).
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  // Runs the full pipeline. Returns { data, usage } where usage aggregates
  // all per-batch extraction usage plus the dedupe call's usage.
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    // Root span for the whole strategy run; all child spans hang off it.
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    // NOTE(review): `step` is shared mutable state incremented from
    // concurrently-running tasks below, so step numbers reported via
    // onStep may not match batch order.
    let step = 1;
    // One extraction task per batch; executed with bounded concurrency.
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        // Per-run strict flag overrides the configured default.
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `parallel_auto_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: strategySpan
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
      return result;
    });
    // Concurrency defaults to "all batches at once" when not configured.
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    const merger = new SmartDataMerger(
      options.schema
    );
    let merged = {};
    debug?.mergeStart({
      mergeId: "parallel_auto_smart_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length
      }
    });
    // Fold every batch result into `merged`, reporting per-field counts.
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      // NOTE(review): prevSize/newSize are computed but never read — dead
      // locals retained for byte-compatibility.
      const prevSize = Object.keys(merged).length;
      merged = merger.merge(merged, result.data);
      const newSize = Object.keys(merged).length;
      for (const key of Object.keys(result.data)) {
        // Array lengths (or undefined for non-array fields) for reporting only.
        const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
        const rightArray = Array.isArray(
          result.data[key]
        ) ? result.data[key].length : void 0;
        debug?.smartMergeField({
          mergeId: "parallel_auto_smart_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray
        });
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1
          });
        }
      }
    }
    debug?.mergeComplete({
      mergeId: "parallel_auto_smart_merge",
      success: true
    });
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged
      });
    }
    // Phase 1 dedupe: remove exact structural duplicates from array fields.
    merged = dedupeArrays(merged);
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing"
      }
    });
    if (exactDedupeSpan && telemetry) {
      // NOTE(review): both counts are taken AFTER dedupeArrays ran, so
      // inputCount always equals outputCount here.
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged
      });
    }
    // Phase 2 dedupe: ask an LLM for "<field>.<index>" paths of semantic
    // duplicates that exact hashing cannot catch.
    const dedupePrompt = buildDeduplicationPrompt(schema, merged);
    debug?.dedupeStart({
      dedupeId: "parallel_auto_dedupe",
      itemCount: Object.keys(merged).length
    });
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm"
      }
    });
    const dedupeResponse = await runWithRetries({
      // A dedicated dedupe model may be configured; falls back to the
      // main extraction model.
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "parallel_auto_dedupe",
      telemetry: telemetry ?? void 0,
      parentSpan: llmDedupeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name
    });
    let deduped = merged;
    // NOTE(review): paths are removed one at a time; removing a lower index
    // shifts later indices within the same array, so a subsequent path into
    // that array may delete the wrong element. Consider removing per-array
    // indices in descending order.
    for (const key of dedupeResponse.data.keys) {
      deduped = removeByPath(deduped, key);
    }
    debug?.dedupeComplete({
      dedupeId: "parallel_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length
    });
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped
      });
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: deduped
    });
    return {
      data: deduped,
      usage: mergeUsage([...results.map((r) => r.usage), dedupeResponse.usage])
    };
  }
};
|
|
2585
|
+
// Convenience factory for the parallel auto-merge strategy.
var parallelAutoMerge = (config) => new ParallelAutoMergeStrategy(config);
|
|
2588
|
+
|
|
2589
|
+
// src/strategies/SequentialAutoMergeStrategy.ts
|
|
2590
|
+
// src/strategies/SequentialAutoMergeStrategy.ts
// Response schema for the LLM dedupe call: the model must return
// `keys`, an array of "<field>.<index>" paths identifying duplicate
// array entries to remove (consumed by removeByPath2 below).
var dedupeSchema2 = {
  type: "object",
  properties: {
    keys: { type: "array", items: { type: "string" } }
  },
  required: ["keys"],
  additionalProperties: false
};
|
|
2598
|
+
// Removes exact structural duplicates from every top-level array field of
// `data`. Non-array fields are passed through; the input is not mutated
// (the returned object is a shallow copy with deduped array copies).
var dedupeArrays2 = (data) => {
  const out = { ...data };
  for (const field of Object.keys(out)) {
    const entries = out[field];
    if (!Array.isArray(entries)) {
      continue;
    }
    const dupes = findExactDuplicatesWithHashing(entries);
    out[field] = deduplicateByIndices(entries, dupes);
  }
  return out;
};
|
|
2608
|
+
// Removes a single array element addressed by a "<field>.<index>" path
// (e.g. "items.3"), as produced by the LLM dedupe step. Returns a shallow
// copy with the element spliced out of a copied array; neither the input
// object nor its array is mutated. Any path that does not resolve to a
// plain non-negative integer index into an existing array is ignored.
//
// FIX (kept consistent with removeByPath): `Number("")` is 0 (so "items."
// deleted index 0) and `Number("-1")` is -1 (splice with a negative start
// deletes from the END of the array). Indices are now validated as
// digit-only strings.
var removeByPath2 = (data, path5) => {
  const [root, indexStr] = path5.split(".");
  if (!root || !/^\d+$/.test(indexStr ?? "")) {
    return data;
  }
  const index = Number(indexStr);
  const value = data[root];
  if (!Array.isArray(value)) {
    return data;
  }
  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
|
|
2622
|
+
// Extraction strategy: processes artifact batches one at a time (in order),
// folding each batch's extraction into a running merge via SmartDataMerger,
// then removes exact duplicates from array fields and finally asks an LLM
// to flag remaining duplicates, removed by "<field>.<index>" path.
var SequentialAutoMergeStrategy = class {
  // Strategy identifier reported in debug/telemetry payloads.
  name = "sequential-auto-merge";
  // Model, chunking and execution settings (set in constructor).
  config;
  constructor(config) {
    this.config = config;
  }
  // Progress total: one step per batch plus a fixed allowance of 3
  // (presumably merge + the two dedupe phases — confirm; only the LLM
  // dedupe step actually emits onStep beyond the batches below).
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  // Runs the full pipeline. Returns { data, usage } where usage aggregates
  // every per-batch extraction's usage plus the dedupe call's usage.
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    // Root span for the whole strategy run.
    const strategySpan = telemetry?.startSpan({
      name: "strategy.sequential-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const merger = new SmartDataMerger(
      options.schema
    );
    let merged = {};
    const usages = [];
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;
    debug?.mergeStart({
      mergeId: "sequential_auto_merge",
      inputCount: batches.length,
      strategy: this.name
    });
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": batches.length
      }
    });
    // Sequential extract-and-merge: each batch is extracted with the batch's
    // artifacts and immediately folded into `merged`.
    for (const [index, batch] of batches.entries()) {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        // Per-run strict flag overrides the configured default.
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `sequential_auto_batch_${index + 1}`,
        // NOTE(review): per-batch extraction spans parent to mergeSpan here,
        // unlike the parallel strategy which parents them to strategySpan —
        // confirm this asymmetry is intentional.
        telemetry: telemetry ?? void 0,
        parentSpan: mergeSpan
      });
      merged = merger.merge(merged, result.data);
      usages.push(result.usage);
      for (const key of Object.keys(result.data)) {
        // Array lengths (or undefined for non-array fields) for reporting only.
        const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
        const rightArray = Array.isArray(
          result.data[key]
        ) ? result.data[key].length : void 0;
        debug?.smartMergeField({
          mergeId: "sequential_auto_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray
        });
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1
          });
        }
      }
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
    }
    debug?.mergeComplete({ mergeId: "sequential_auto_merge", success: true });
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged
      });
    }
    // Phase 1 dedupe: remove exact structural duplicates from array fields.
    merged = dedupeArrays2(merged);
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing"
      }
    });
    if (exactDedupeSpan && telemetry) {
      // NOTE(review): both counts are taken AFTER dedupeArrays2 ran, so
      // inputCount always equals outputCount here.
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged
      });
    }
    // Phase 2 dedupe: ask an LLM for "<field>.<index>" paths of semantic
    // duplicates that exact hashing cannot catch.
    const dedupePrompt = buildDeduplicationPrompt(schema, merged);
    debug?.dedupeStart({
      dedupeId: "sequential_auto_dedupe",
      itemCount: Object.keys(merged).length
    });
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm"
      }
    });
    const dedupeResponse = await runWithRetries({
      // Dedicated dedupe model if configured, else the extraction model.
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema2,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "sequential_auto_dedupe",
      telemetry: telemetry ?? void 0,
      parentSpan: llmDedupeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name
    });
    let deduped = merged;
    // NOTE(review): removing a lower index shifts later indices within the
    // same array, so a subsequent path into that array may delete the wrong
    // element. Consider removing per-array indices in descending order.
    for (const key of dedupeResponse.data.keys) {
      deduped = removeByPath2(deduped, key);
    }
    debug?.dedupeComplete({
      dedupeId: "sequential_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length
    });
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped
      });
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: deduped
    });
    return {
      data: deduped,
      usage: mergeUsage([...usages, dedupeResponse.usage])
    };
  }
};
|
|
2833
|
+
// Convenience factory for the sequential auto-merge strategy.
var sequentialAutoMerge = (config) => new SequentialAutoMergeStrategy(config);
|
|
2836
|
+
|
|
2837
|
+
// src/strategies/DoublePassStrategy.ts
|
|
2838
|
+
// src/strategies/DoublePassStrategy.ts
// Extraction strategy with two passes: pass 1 extracts every batch in
// parallel and merges the results with a dedicated LLM merge call; pass 2
// revisits each batch sequentially, feeding the current merged data back
// into the prompt so the model can refine it.
var DoublePassStrategy = class {
  // Strategy identifier reported in debug/telemetry payloads.
  name = "double-pass";
  // Model, merge-model, chunking, concurrency and execution settings.
  config;
  constructor(config) {
    this.config = config;
  }
  // Progress total: each batch is visited twice (pass 1 + pass 2) plus a
  // fixed allowance of 3 (presumably the merge and bookkeeping steps —
  // confirm; only the pass-1 merge emits an extra onStep below).
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length * 2 + 3;
  }
  // Runs both passes. Returns { data, usage } where usage aggregates all
  // pass-1 extractions, the merge call, and all pass-2 refinements.
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    // Root span for the whole strategy run.
    const strategySpan = telemetry?.startSpan({
      name: "strategy.double-pass",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    // NOTE(review): `step` is shared mutable state incremented from
    // concurrently-running pass-1 tasks; reported step numbers may not
    // match batch order.
    let step = 1;
    const pass1Span = telemetry?.startSpan({
      name: "struktur.pass_1",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 1,
        "pass.type": "parallel_extraction"
      }
    });
    // Pass 1: one extraction task per batch, run with bounded concurrency.
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        // Per-run strict flag overrides the configured default (pass 1 only).
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `double_pass_1_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass1Span
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
      return result;
    });
    // Concurrency defaults to "all batches at once" when not configured.
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    debug?.mergeStart({
      mergeId: "double_pass_1_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const pass1MergeSpan = telemetry?.startSpan({
      name: "struktur.pass_1_merge",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length
      }
    });
    // Merge the pass-1 results with a dedicated LLM call (no artifacts,
    // only the partial results embedded in the prompt).
    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data)
    );
    const merged = await extractWithPrompt({
      // NOTE(review): uses config.mergeModel with no `?? this.config.model`
      // fallback, unlike the dedupe strategies — confirm mergeModel is
      // required by the config type.
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute,
      strict: this.config.strict,
      debug,
      callId: "double_pass_1_merge",
      telemetry: telemetry ?? void 0,
      parentSpan: pass1MergeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "pass 1 merge"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "pass 1 merge",
      strategy: this.name
    });
    debug?.mergeComplete({ mergeId: "double_pass_1_merge", success: true });
    if (pass1MergeSpan && telemetry) {
      telemetry.recordEvent(pass1MergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1
      });
      telemetry.endSpan(pass1MergeSpan, {
        status: "ok",
        output: merged.data
      });
    }
    telemetry?.endSpan(pass1Span, {
      status: "ok",
      output: merged.data
    });
    const pass2Span = telemetry?.startSpan({
      name: "struktur.pass_2",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 2,
        "pass.type": "sequential_refinement"
      }
    });
    // Pass 2: sequentially refine the merged data, one batch at a time;
    // each iteration replaces `currentData` wholesale with the model output.
    let currentData = merged.data;
    const usages = [...results.map((r) => r.usage), merged.usage];
    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        // NOTE(review): pass 2 uses config.strict directly while pass 1
        // honors `options.strict ?? this.config.strict` — confirm this
        // inconsistency is intentional.
        strict: this.config.strict,
        debug,
        callId: `double_pass_2_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass2Span
      });
      currentData = result.data;
      usages.push(result.usage);
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
    }
    telemetry?.endSpan(pass2Span, {
      status: "ok",
      output: currentData
    });
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: currentData
    });
    return { data: currentData, usage: mergeUsage(usages) };
  }
};
|
|
3043
|
+
// Convenience factory for the double-pass strategy.
var doublePass = (config) => new DoublePassStrategy(config);
|
|
3046
|
+
|
|
3047
|
+
// src/strategies/DoublePassAutoMergeStrategy.ts
|
|
3048
|
+
// src/strategies/DoublePassAutoMergeStrategy.ts
// Response schema for the LLM dedupe call: the model must return
// `keys`, an array of "<field>.<index>" paths identifying duplicate
// array entries to remove (consumed by removeByPath3 below).
var dedupeSchema3 = {
  type: "object",
  properties: {
    keys: { type: "array", items: { type: "string" } }
  },
  required: ["keys"],
  additionalProperties: false
};
|
|
3056
|
+
// Removes exact structural duplicates from every top-level array field of
// `data`. Non-array fields are passed through; the input is not mutated
// (the returned object is a shallow copy with deduped array copies).
var dedupeArrays3 = (data) => {
  const out = { ...data };
  for (const field of Object.keys(out)) {
    const entries = out[field];
    if (!Array.isArray(entries)) {
      continue;
    }
    const dupes = findExactDuplicatesWithHashing(entries);
    out[field] = deduplicateByIndices(entries, dupes);
  }
  return out;
};
|
|
3066
|
+
// Removes a single array element addressed by a "<field>.<index>" path
// (e.g. "items.3"), as produced by the LLM dedupe step. Returns a shallow
// copy with the element spliced out of a copied array; neither the input
// object nor its array is mutated. Any path that does not resolve to a
// plain non-negative integer index into an existing array is ignored.
//
// FIX (kept consistent with removeByPath): `Number("")` is 0 (so "items."
// deleted index 0) and `Number("-1")` is -1 (splice with a negative start
// deletes from the END of the array). Indices are now validated as
// digit-only strings.
var removeByPath3 = (data, path5) => {
  const [root, indexStr] = path5.split(".");
  if (!root || !/^\d+$/.test(indexStr ?? "")) {
    return data;
  }
  const index = Number(indexStr);
  const value = data[root];
  if (!Array.isArray(value)) {
    return data;
  }
  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
|
|
3080
|
+
var DoublePassAutoMergeStrategy = class {
|
|
3081
|
+
name = "double-pass-auto-merge";
|
|
3082
|
+
config;
|
|
3083
|
+
constructor(config) {
|
|
3084
|
+
this.config = config;
|
|
3085
|
+
}
|
|
3086
|
+
getEstimatedSteps(artifacts) {
|
|
3087
|
+
const batches = getBatches(artifacts, {
|
|
3088
|
+
maxTokens: this.config.chunkSize,
|
|
3089
|
+
maxImages: this.config.maxImages
|
|
3090
|
+
});
|
|
3091
|
+
return batches.length * 2 + 3;
|
|
3092
|
+
}
|
|
3093
|
+
async run(options) {
|
|
3094
|
+
const debug = options.debug;
|
|
3095
|
+
const { telemetry } = options;
|
|
3096
|
+
const strategySpan = telemetry?.startSpan({
|
|
3097
|
+
name: "strategy.double-pass-auto-merge",
|
|
3098
|
+
kind: "CHAIN",
|
|
3099
|
+
attributes: {
|
|
3100
|
+
"strategy.name": this.name,
|
|
3101
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
3102
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
3103
|
+
"strategy.concurrency": this.config.concurrency
|
|
3104
|
+
}
|
|
3105
|
+
});
|
|
3106
|
+
const batches = getBatches(
|
|
3107
|
+
options.artifacts,
|
|
3108
|
+
{
|
|
3109
|
+
maxTokens: this.config.chunkSize,
|
|
3110
|
+
maxImages: this.config.maxImages
|
|
3111
|
+
},
|
|
3112
|
+
debug,
|
|
3113
|
+
telemetry ?? void 0,
|
|
3114
|
+
strategySpan
|
|
3115
|
+
);
|
|
3116
|
+
const schema = serializeSchema(options.schema);
|
|
3117
|
+
const totalSteps = this.getEstimatedSteps(options.artifacts);
|
|
3118
|
+
let step = 1;
|
|
3119
|
+
const pass1Span = telemetry?.startSpan({
|
|
3120
|
+
name: "struktur.pass_1",
|
|
3121
|
+
kind: "CHAIN",
|
|
3122
|
+
parentSpan: strategySpan,
|
|
3123
|
+
attributes: {
|
|
3124
|
+
"pass.number": 1,
|
|
3125
|
+
"pass.type": "parallel_extraction"
|
|
3126
|
+
}
|
|
3127
|
+
});
|
|
3128
|
+
const tasks = batches.map((batch, index) => async () => {
|
|
3129
|
+
const prompt = buildExtractorPrompt(
|
|
3130
|
+
batch,
|
|
3131
|
+
schema,
|
|
3132
|
+
this.config.outputInstructions
|
|
3133
|
+
);
|
|
3134
|
+
const result = await extractWithPrompt({
|
|
3135
|
+
model: this.config.model,
|
|
3136
|
+
schema: options.schema,
|
|
3137
|
+
system: prompt.system,
|
|
3138
|
+
user: prompt.user,
|
|
3139
|
+
artifacts: batch,
|
|
3140
|
+
events: options.events,
|
|
3141
|
+
execute: this.config.execute,
|
|
3142
|
+
strict: options.strict ?? this.config.strict,
|
|
3143
|
+
debug,
|
|
3144
|
+
callId: `double_pass_auto_1_batch_${index + 1}`,
|
|
3145
|
+
telemetry: telemetry ?? void 0,
|
|
3146
|
+
parentSpan: pass1Span
|
|
3147
|
+
});
|
|
3148
|
+
step += 1;
|
|
3149
|
+
await options.events?.onStep?.({
|
|
3150
|
+
step,
|
|
3151
|
+
total: totalSteps,
|
|
3152
|
+
label: `pass 1 batch ${index + 1}/${batches.length}`
|
|
3153
|
+
});
|
|
3154
|
+
debug?.step({
|
|
3155
|
+
step,
|
|
3156
|
+
total: totalSteps,
|
|
3157
|
+
label: `pass 1 batch ${index + 1}/${batches.length}`,
|
|
3158
|
+
strategy: this.name
|
|
3159
|
+
});
|
|
3160
|
+
return result;
|
|
3161
|
+
});
|
|
3162
|
+
const results = await runConcurrently(
|
|
3163
|
+
tasks,
|
|
3164
|
+
this.config.concurrency ?? batches.length
|
|
3165
|
+
);
|
|
3166
|
+
const merger = new SmartDataMerger(
|
|
3167
|
+
options.schema
|
|
3168
|
+
);
|
|
3169
|
+
let merged = {};
|
|
3170
|
+
debug?.mergeStart({
|
|
3171
|
+
mergeId: "double_pass_auto_merge",
|
|
3172
|
+
inputCount: results.length,
|
|
3173
|
+
strategy: this.name
|
|
3174
|
+
});
|
|
3175
|
+
const mergeSpan = telemetry?.startSpan({
|
|
3176
|
+
name: "struktur.smart_merge",
|
|
3177
|
+
kind: "CHAIN",
|
|
3178
|
+
parentSpan: pass1Span,
|
|
3179
|
+
attributes: {
|
|
3180
|
+
"merge.strategy": "smart",
|
|
3181
|
+
"merge.input_count": results.length
|
|
3182
|
+
}
|
|
3183
|
+
});
|
|
3184
|
+
for (let i = 0; i < results.length; i++) {
|
|
3185
|
+
const result = results[i];
|
|
3186
|
+
merged = merger.merge(merged, result.data);
|
|
3187
|
+
for (const key of Object.keys(result.data)) {
|
|
3188
|
+
const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
|
|
3189
|
+
const rightArray = Array.isArray(
|
|
3190
|
+
result.data[key]
|
|
3191
|
+
) ? result.data[key].length : void 0;
|
|
3192
|
+
debug?.smartMergeField({
|
|
3193
|
+
mergeId: "double_pass_auto_merge",
|
|
3194
|
+
field: key,
|
|
3195
|
+
operation: "merge_arrays",
|
|
3196
|
+
leftCount: leftArray,
|
|
3197
|
+
rightCount: rightArray
|
|
3198
|
+
});
|
|
3199
|
+
if (mergeSpan && telemetry) {
|
|
3200
|
+
telemetry.recordEvent(mergeSpan, {
|
|
3201
|
+
type: "merge",
|
|
3202
|
+
strategy: "smart",
|
|
3203
|
+
inputCount: rightArray ?? 1,
|
|
3204
|
+
outputCount: leftArray ?? 1
|
|
3205
|
+
});
|
|
3206
|
+
}
|
|
3207
|
+
}
|
|
3208
|
+
}
|
|
3209
|
+
debug?.mergeComplete({ mergeId: "double_pass_auto_merge", success: true });
|
|
3210
|
+
if (mergeSpan && telemetry) {
|
|
3211
|
+
telemetry.endSpan(mergeSpan, {
|
|
3212
|
+
status: "ok",
|
|
3213
|
+
output: merged
|
|
3214
|
+
});
|
|
3215
|
+
}
|
|
3216
|
+
merged = dedupeArrays3(merged);
|
|
3217
|
+
const exactDedupeSpan = telemetry?.startSpan({
|
|
3218
|
+
name: "struktur.exact_dedupe",
|
|
3219
|
+
kind: "CHAIN",
|
|
3220
|
+
parentSpan: pass1Span,
|
|
3221
|
+
attributes: {
|
|
3222
|
+
"dedupe.method": "exact_hashing"
|
|
3223
|
+
}
|
|
3224
|
+
});
|
|
3225
|
+
if (exactDedupeSpan && telemetry) {
|
|
3226
|
+
telemetry.recordEvent(exactDedupeSpan, {
|
|
3227
|
+
type: "merge",
|
|
3228
|
+
strategy: "exact_hash_dedupe",
|
|
3229
|
+
inputCount: Object.keys(merged).length,
|
|
3230
|
+
outputCount: Object.keys(merged).length
|
|
3231
|
+
});
|
|
3232
|
+
telemetry.endSpan(exactDedupeSpan, {
|
|
3233
|
+
status: "ok",
|
|
3234
|
+
output: merged
|
|
3235
|
+
});
|
|
3236
|
+
}
|
|
3237
|
+
const dedupePrompt = buildDeduplicationPrompt(schema, merged);
|
|
3238
|
+
debug?.dedupeStart({
|
|
3239
|
+
dedupeId: "double_pass_auto_dedupe",
|
|
3240
|
+
itemCount: Object.keys(merged).length
|
|
3241
|
+
});
|
|
3242
|
+
const llmDedupeSpan = telemetry?.startSpan({
|
|
3243
|
+
name: "struktur.llm_dedupe",
|
|
3244
|
+
kind: "CHAIN",
|
|
3245
|
+
parentSpan: pass1Span,
|
|
3246
|
+
attributes: {
|
|
3247
|
+
"dedupe.method": "llm"
|
|
3248
|
+
}
|
|
3249
|
+
});
|
|
3250
|
+
const dedupeResponse = await runWithRetries({
|
|
3251
|
+
model: this.config.dedupeModel ?? this.config.model,
|
|
3252
|
+
schema: dedupeSchema3,
|
|
3253
|
+
system: dedupePrompt.system,
|
|
3254
|
+
user: dedupePrompt.user,
|
|
3255
|
+
events: options.events,
|
|
3256
|
+
execute: this.config.dedupeExecute,
|
|
3257
|
+
strict: this.config.strict,
|
|
3258
|
+
debug,
|
|
3259
|
+
callId: "double_pass_auto_dedupe",
|
|
3260
|
+
telemetry: telemetry ?? void 0,
|
|
3261
|
+
parentSpan: llmDedupeSpan
|
|
3262
|
+
});
|
|
3263
|
+
step += 1;
|
|
3264
|
+
await options.events?.onStep?.({
|
|
3265
|
+
step,
|
|
3266
|
+
total: totalSteps,
|
|
3267
|
+
label: "pass 1 dedupe"
|
|
3268
|
+
});
|
|
3269
|
+
debug?.step({
|
|
3270
|
+
step,
|
|
3271
|
+
total: totalSteps,
|
|
3272
|
+
label: "pass 1 dedupe",
|
|
3273
|
+
strategy: this.name
|
|
3274
|
+
});
|
|
3275
|
+
let deduped = merged;
|
|
3276
|
+
for (const key of dedupeResponse.data.keys) {
|
|
3277
|
+
deduped = removeByPath3(deduped, key);
|
|
3278
|
+
}
|
|
3279
|
+
debug?.dedupeComplete({
|
|
3280
|
+
dedupeId: "double_pass_auto_dedupe",
|
|
3281
|
+
duplicatesFound: dedupeResponse.data.keys.length,
|
|
3282
|
+
itemsRemoved: dedupeResponse.data.keys.length
|
|
3283
|
+
});
|
|
3284
|
+
if (llmDedupeSpan && telemetry) {
|
|
3285
|
+
telemetry.recordEvent(llmDedupeSpan, {
|
|
3286
|
+
type: "merge",
|
|
3287
|
+
strategy: "llm_dedupe",
|
|
3288
|
+
inputCount: Object.keys(merged).length,
|
|
3289
|
+
outputCount: Object.keys(deduped).length,
|
|
3290
|
+
deduped: dedupeResponse.data.keys.length
|
|
3291
|
+
});
|
|
3292
|
+
telemetry.endSpan(llmDedupeSpan, {
|
|
3293
|
+
status: "ok",
|
|
3294
|
+
output: deduped
|
|
3295
|
+
});
|
|
3296
|
+
}
|
|
3297
|
+
telemetry?.endSpan(pass1Span, {
|
|
3298
|
+
status: "ok",
|
|
3299
|
+
output: deduped
|
|
3300
|
+
});
|
|
3301
|
+
let currentData = deduped;
|
|
3302
|
+
const usages = [...results.map((r) => r.usage), dedupeResponse.usage];
|
|
3303
|
+
const pass2Span = telemetry?.startSpan({
|
|
3304
|
+
name: "struktur.pass_2",
|
|
3305
|
+
kind: "CHAIN",
|
|
3306
|
+
parentSpan: strategySpan,
|
|
3307
|
+
attributes: {
|
|
3308
|
+
"pass.number": 2,
|
|
3309
|
+
"pass.type": "sequential_refinement"
|
|
3310
|
+
}
|
|
3311
|
+
});
|
|
3312
|
+
for (const [index, batch] of batches.entries()) {
|
|
3313
|
+
const prompt = buildSequentialPrompt(
|
|
3314
|
+
batch,
|
|
3315
|
+
schema,
|
|
3316
|
+
JSON.stringify(currentData),
|
|
3317
|
+
this.config.outputInstructions
|
|
3318
|
+
);
|
|
3319
|
+
const result = await extractWithPrompt({
|
|
3320
|
+
model: this.config.model,
|
|
3321
|
+
schema: options.schema,
|
|
3322
|
+
system: prompt.system,
|
|
3323
|
+
user: prompt.user,
|
|
3324
|
+
artifacts: batch,
|
|
3325
|
+
events: options.events,
|
|
3326
|
+
execute: this.config.execute,
|
|
3327
|
+
strict: this.config.strict,
|
|
3328
|
+
debug,
|
|
3329
|
+
callId: `double_pass_auto_2_batch_${index + 1}`,
|
|
3330
|
+
telemetry: telemetry ?? void 0,
|
|
3331
|
+
parentSpan: pass2Span
|
|
3332
|
+
});
|
|
3333
|
+
currentData = result.data;
|
|
3334
|
+
usages.push(result.usage);
|
|
3335
|
+
step += 1;
|
|
3336
|
+
await options.events?.onStep?.({
|
|
3337
|
+
step,
|
|
3338
|
+
total: totalSteps,
|
|
3339
|
+
label: `pass 2 batch ${index + 1}/${batches.length}`
|
|
3340
|
+
});
|
|
3341
|
+
debug?.step({
|
|
3342
|
+
step,
|
|
3343
|
+
total: totalSteps,
|
|
3344
|
+
label: `pass 2 batch ${index + 1}/${batches.length}`,
|
|
3345
|
+
strategy: this.name
|
|
3346
|
+
});
|
|
3347
|
+
}
|
|
3348
|
+
telemetry?.endSpan(pass2Span, {
|
|
3349
|
+
status: "ok",
|
|
3350
|
+
output: currentData
|
|
3351
|
+
});
|
|
3352
|
+
telemetry?.endSpan(strategySpan, {
|
|
3353
|
+
status: "ok",
|
|
3354
|
+
output: currentData
|
|
3355
|
+
});
|
|
3356
|
+
return { data: currentData, usage: mergeUsage(usages) };
|
|
3357
|
+
}
|
|
3358
|
+
};
|
|
3359
|
+
// Convenience factory matching the other strategy constructors: hides the
// DoublePassAutoMergeStrategy class behind a plain function call.
var doublePassAutoMerge = (config) => new DoublePassAutoMergeStrategy(config);
|
|
3362
|
+
|
|
3363
|
+
// src/index.ts
// Bundler glue for the package entry point. init_collect/init_pdf are
// presumably esbuild's deferred module initializers for lazily-registered
// sections of the bundle — run them eagerly here so their exports are
// ready before the re-exported agent bindings are used (TODO confirm
// against the bundler output conventions).
init_collect();
init_pdf();
import { AgentStrategy, agent } from "@struktur/agent-strategy";
|
|
3367
|
+
|
|
3368
|
+
// src/debug/logger.ts
|
|
3369
|
+
// Factory for a structured JSON debug logger. Every record is emitted as a
// single JSON line on stderr, tagged with an ISO-8601 timestamp and a
// `type` discriminator so downstream tooling can filter by event kind.
// When `enabled` is false every method is a cheap no-op.
var createDebugLogger = (enabled) => {
  const emit = (type, data) => {
    if (!enabled) return;
    // Spread last so a caller-supplied `type` key wins, exactly as the
    // original `{ timestamp, ...entry }` shape allowed.
    const record = { timestamp: new Date().toISOString(), type, ...data };
    process.stderr.write(`${JSON.stringify(record)}\n`);
  };
  // All logger methods share one shape: forward the payload under a fixed
  // event type.
  const makeEmitter = (type) => (data) => {
    emit(type, data);
  };
  return {
    // CLI initialization
    cliInit: makeEmitter("cli_init"),
    schemaLoaded: makeEmitter("schema_loaded"),
    artifactsLoaded: makeEmitter("artifacts_loaded"),
    modelResolved: makeEmitter("model_resolved"),
    strategyCreated: makeEmitter("strategy_created"),
    // Chunking
    chunkingStart: makeEmitter("chunking_start"),
    chunkingSplit: makeEmitter("chunking_split"),
    chunkingResult: makeEmitter("chunking_result"),
    batchingStart: makeEmitter("batching_start"),
    batchCreated: makeEmitter("batch_created"),
    batchingComplete: makeEmitter("batching_complete"),
    // Strategy execution
    strategyRunStart: makeEmitter("strategy_run_start"),
    step: makeEmitter("step"),
    progress: makeEmitter("progress"),
    // LLM calls
    llmCallStart: makeEmitter("llm_call_start"),
    llmCallComplete: makeEmitter("llm_call_complete"),
    // Retry events
    retry: makeEmitter("retry"),
    // Validation
    validationStart: makeEmitter("validation_start"),
    validationSuccess: makeEmitter("validation_success"),
    validationFailed: makeEmitter("validation_failed"),
    // Merging
    mergeStart: makeEmitter("merge_start"),
    mergeComplete: makeEmitter("merge_complete"),
    // Deduplication
    dedupeStart: makeEmitter("dedupe_start"),
    dedupeComplete: makeEmitter("dedupe_complete"),
    // Token usage tracking
    tokenUsage: makeEmitter("token_usage"),
    // Results
    extractionComplete: makeEmitter("extraction_complete"),
    // Prompt details (verbose)
    promptSystem: makeEmitter("prompt_system"),
    promptUser: makeEmitter("prompt_user"),
    // Raw response
    rawResponse: makeEmitter("raw_response"),
    // Smart merge details
    smartMergeField: makeEmitter("smart_merge_field"),
  };
};
|
|
3482
|
+
|
|
3483
|
+
// src/auth/tokens.ts
|
|
3484
|
+
import path3 from "path";
|
|
3485
|
+
import os2 from "os";
|
|
3486
|
+
import { chmod, mkdir, readFile as readFile3, writeFile as writeFile2, stat, access } from "fs/promises";
|
|
3487
|
+
import { execFile } from "child_process";
|
|
3488
|
+
import { promisify as promisify2 } from "util";
|
|
3489
|
+
var execFileAsync = promisify2(execFile);
var CONFIG_DIR_ENV = "STRUKTUR_CONFIG_DIR";
var DISABLE_KEYCHAIN_ENV = "STRUKTUR_DISABLE_KEYCHAIN";
var SERVICE_ENV = "STRUKTUR_KEYCHAIN_SERVICE";
var DEFAULT_SERVICE = "struktur";
// Directory holding struktur's on-disk state; overridable (e.g. for tests)
// via STRUKTUR_CONFIG_DIR.
var resolveConfigDir = () => {
  return process.env[CONFIG_DIR_ENV] ?? path3.join(os2.homedir(), ".config", "struktur");
};
var resolveTokensPath = () => path3.join(resolveConfigDir(), "tokens.json");
var emptyStore = () => ({ version: 1, providers: {} });
// Load the token store from disk.
// Fix: the original stat-then-read had a TOCTOU window and, worse, let
// JSON.parse throw on a corrupt tokens.json, crashing every command.
// A missing, unreadable, corrupt, or unrecognized file now all degrade to
// an empty store.
var readTokenStore = async () => {
  const tokensPath = resolveTokensPath();
  let raw;
  try {
    raw = await readFile3(tokensPath, "utf-8");
  } catch {
    // Most commonly ENOENT on first run.
    return emptyStore();
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Corrupt JSON: treat as empty rather than crashing the CLI.
    return emptyStore();
  }
  if (!parsed || parsed.version !== 1 || typeof parsed.providers !== "object") {
    return emptyStore();
  }
  return parsed;
};
// Persist the store, tightening permissions to owner-only (0o700 dir,
// 0o600 file — 448/384 decimal) since it may contain plaintext tokens.
var writeTokenStore = async (store) => {
  const configDir = resolveConfigDir();
  const tokensPath = resolveTokensPath();
  await mkdir(configDir, { recursive: true, mode: 448 });
  await writeFile2(tokensPath, JSON.stringify(store, null, 2));
  await chmod(configDir, 448);
  await chmod(tokensPath, 384);
};
// Keychain storage is only offered on macOS via /usr/bin/security, and can
// be disabled explicitly through STRUKTUR_DISABLE_KEYCHAIN.
var isKeychainAvailable = async () => {
  if (process.env[DISABLE_KEYCHAIN_ENV]) {
    return false;
  }
  if (process.platform !== "darwin") {
    return false;
  }
  try {
    await access("/usr/bin/security");
    return true;
  } catch {
    return false;
  }
};
var keychainService = () => process.env[SERVICE_ENV] ?? DEFAULT_SERVICE;
// Run the macOS `security` tool, surfacing its stderr (when present) as
// the thrown error message instead of the generic exec failure.
var runSecurity = async (args) => {
  try {
    const { stdout } = await execFileAsync("/usr/bin/security", args);
    return stdout;
  } catch (error) {
    if (error instanceof Error && "stderr" in error) {
      const stderr = error.stderr;
      const message = stderr?.trim() || error.message;
      throw new Error(message);
    }
    throw error;
  }
};
|
|
3549
|
+
// Store `token` for `provider` in the macOS keychain; `-U` updates an
// existing item in place instead of failing on duplicates.
var writeKeychainToken = async (provider, token) => {
  const args = ["add-generic-password", "-a", provider, "-s", keychainService(), "-w", token, "-U"];
  await runSecurity(args);
};
// Fetch the secret stored for `provider`; `-w` makes `security` print just
// the password, which arrives with a trailing newline.
var readKeychainToken = async (provider) => {
  const raw = await runSecurity(["find-generic-password", "-a", provider, "-s", keychainService(), "-w"]);
  return raw.trim();
};
// Remove the keychain item for `provider`.
var deleteKeychainToken = async (provider) => {
  await runSecurity(["delete-generic-password", "-a", provider, "-s", keychainService()]);
};
|
|
3581
|
+
// Summarize which providers have a stored credential and where each lives.
var listStoredProviders = async () => {
  const { providers } = await readTokenStore();
  const summaries = [];
  for (const [provider, entry] of Object.entries(providers)) {
    summaries.push({ provider, storage: entry.storage });
  }
  return summaries;
};
// Persist a provider token. `storage` may be "file", "keychain", or "auto"
// (keychain when available, file otherwise). Returns the storage actually
// used; throws when keychain storage is requested explicitly but the
// platform cannot provide it.
var setProviderToken = async (provider, token, storage = "auto") => {
  const store = await readTokenStore();
  let resolvedStorage = "file";
  if (storage === "keychain") {
    if (!await isKeychainAvailable()) {
      throw new Error("Keychain is not available on this platform.");
    }
    resolvedStorage = "keychain";
  } else if (storage === "auto") {
    resolvedStorage = await isKeychainAvailable() ? "keychain" : "file";
  }
  if (resolvedStorage === "keychain") {
    await writeKeychainToken(provider, token);
    // Only a pointer lives on disk; the secret itself stays in the keychain.
    store.providers[provider] = {
      storage: "keychain",
      account: provider,
      service: keychainService(),
    };
  } else {
    store.providers[provider] = { storage: "file", token };
  }
  await writeTokenStore(store);
  return resolvedStorage;
};
|
|
3615
|
+
// Forget a provider's credential. Returns false when nothing was stored.
// Keychain deletion failures are swallowed so a stale index entry can
// always be cleared from disk.
var deleteProviderToken = async (provider) => {
  const store = await readTokenStore();
  const entry = store.providers[provider];
  if (!entry) {
    return false;
  }
  if (entry.storage === "keychain") {
    try {
      await deleteKeychainToken(provider);
    } catch {
      // Best effort — the on-disk index entry below is removed regardless.
    }
  }
  delete store.providers[provider];
  await writeTokenStore(store);
  return true;
};
// Look up a provider's token, reading the keychain when the index says the
// secret lives there. Resolves to undefined when nothing usable is stored.
var resolveProviderToken = async (provider) => {
  const { providers } = await readTokenStore();
  const entry = providers[provider];
  if (!entry) {
    return undefined;
  }
  if (entry.storage === "file") {
    return entry.token;
  }
  try {
    return await readKeychainToken(provider);
  } catch {
    return undefined;
  }
};
// Like resolveProviderToken, but a missing token is fatal.
var getProviderTokenOrThrow = async (provider) => {
  const token = await resolveProviderToken(provider);
  if (!token) {
    throw new Error(`No token stored for provider: ${provider}`);
  }
  return token;
};
|
|
3653
|
+
// Map a provider id to the conventional environment variable holding its
// API key; undefined for providers without a well-known variable.
var resolveProviderEnvVar = (provider) => {
  const envVars = new Map([
    ["openai", "OPENAI_API_KEY"],
    ["anthropic", "ANTHROPIC_API_KEY"],
    ["google", "GOOGLE_GENERATIVE_AI_API_KEY"],
    ["opencode", "OPENCODE_API_KEY"],
    ["openrouter", "OPENROUTER_API_KEY"],
  ]);
  return envVars.get(provider);
};
// Redact a token for display: very short tokens are fully masked, longer
// ones keep only the first and last four characters.
var maskToken = (token) => {
  if (token.length <= 8) {
    return "********";
  }
  const head = token.slice(0, 4);
  const tail = token.slice(-4);
  return `${head}...${tail}`;
};
|
|
3675
|
+
|
|
3676
|
+
// src/llm/models.ts
|
|
3677
|
+
var openAiModelsUrl = "https://api.openai.com/v1/models";
var anthropicModelsUrl = "https://api.anthropic.com/v1/models";
var googleModelsUrl = "https://generativelanguage.googleapis.com/v1beta/models";
var openRouterModelsUrl = "https://openrouter.ai/api/v1/models";
// Resolve a provider credential: environment variable first, then the
// persisted token store.
var getTokenForProvider = async (provider) => {
  const envVar = resolveProviderEnvVar(provider);
  const fromEnv = envVar ? process.env[envVar] : undefined;
  if (fromEnv) {
    return fromEnv;
  }
  return resolveProviderToken(provider);
};
// The parsers below tolerate malformed payloads: entries whose id/name is
// not a string are silently dropped.
var parseOpenAiModels = (json) => {
  const entries = json?.data ?? [];
  return entries.map((entry) => entry.id).filter((id) => typeof id === "string");
};
var parseAnthropicModels = (json) => {
  const entries = json?.data ?? [];
  return entries.map((entry) => entry.id).filter((id) => typeof id === "string");
};
// Google reports fully-qualified names like "models/gemini-1.5-pro"; the
// "models/" prefix is stripped so ids line up with other providers.
var parseGoogleModels = (json) => {
  const entries = json?.models ?? [];
  const names = entries.map((entry) => entry.name).filter((name) => typeof name === "string");
  return names.map((name) => name.replace(/^models\//, ""));
};
var parseOpenRouterModels = (json) => {
  const entries = json?.data ?? [];
  return entries.map((entry) => entry.id).filter((id) => typeof id === "string");
};
|
|
3704
|
+
// Fetch the raw model-id list for a provider using its native API.
// The opencode provider has no list endpoint, so a curated static
// catalogue is returned instead. Throws for unknown providers.
var requestModels = async (provider, token) => {
  // Shared GET-and-parse helper: non-2xx responses surface the response
  // body text as the error message.
  const fetchJson = async (url, init) => {
    const response = await fetch(url, init);
    if (!response.ok) {
      throw new Error(await response.text());
    }
    return response.json();
  };
  switch (provider) {
    case "openai":
      return parseOpenAiModels(
        await fetchJson(openAiModelsUrl, { headers: { Authorization: `Bearer ${token}` } })
      );
    case "anthropic":
      return parseAnthropicModels(
        await fetchJson(anthropicModelsUrl, {
          headers: {
            "x-api-key": token,
            "anthropic-version": "2023-06-01",
          },
        })
      );
    case "google":
      // Google authenticates via a query parameter rather than a header.
      return parseGoogleModels(
        await fetchJson(`${googleModelsUrl}?key=${encodeURIComponent(token)}`)
      );
    case "openrouter":
      return parseOpenRouterModels(
        await fetchJson(openRouterModelsUrl, { headers: { Authorization: `Bearer ${token}` } })
      );
    case "opencode":
      // No discovery endpoint: hard-coded catalogue of models served by
      // the OpenCode gateway.
      return [
        "gpt-5.2",
        "gpt-5.2-codex",
        "gpt-5.1",
        "gpt-5.1-codex",
        "gpt-5.1-codex-max",
        "gpt-5.1-codex-mini",
        "gpt-5",
        "gpt-5-codex",
        "gpt-5-nano",
        "claude-opus-4-6",
        "claude-opus-4-5",
        "claude-opus-4-1",
        "claude-sonnet-4-6",
        "claude-sonnet-4-5",
        "claude-sonnet-4",
        "claude-haiku-4-5",
        "claude-haiku-3.5",
        "gemini-3.1-pro",
        "gemini-3-pro",
        "gemini-3-flash",
        "minimax-m2.5",
        "minimax-m2.5-free",
        "minimax-m2.1",
        "glm-5",
        "glm-5-free",
        "glm-4.7",
        "glm-4.6",
        "kimi-k2.5",
        "kimi-k2.5-free",
        "kimi-k2-thinking",
        "kimi-k2",
        "qwen3-coder",
        "big-pickle",
      ];
    default:
      throw new Error(`Unsupported provider: ${provider}`);
  }
};
|
|
3785
|
+
// Ordered cheapest-first model preferences per provider; the first
// preference that matches an available model wins.
// NOTE(review): the opencode entry "kimi-k2-free" does not match any model
// in the hard-coded opencode catalogue ("kimi-k2.5-free") — possibly a
// stale preference; confirm intent before changing.
var cheapestModelPreferences = {
  openai: ["gpt-4.1-nano", "gpt-4.1-mini", "gpt-4o-mini", "gpt-4o"],
  anthropic: ["claude-3-5-haiku", "claude-3-haiku"],
  google: ["gemini-1.5-flash-8b", "gemini-1.5-flash", "gemini-2.0-flash", "gemini-1.5-pro"],
  opencode: ["gpt-5-nano", "claude-haiku-3.5", "gemini-3-flash", "kimi-k2-free", "glm-5-free", "minimax-m2.5-free"],
  openrouter: ["openai/gpt-4o-mini", "anthropic/claude-3.5-haiku", "google/gemini-flash-1.5"]
};
// A model matches a preference exactly or as a dash-suffixed variant
// (e.g. "gpt-4o-mini-2024-07-18" matches "gpt-4o-mini").
var matchesPreference = (model, preference) => {
  if (model === preference) {
    return true;
  }
  return model.startsWith(`${preference}-`);
};
// Fetch one provider's model list. Never throws: failures are reported as
// { ok: false, error }.
var listProviderModels = async (provider) => {
  const token = await getTokenForProvider(provider);
  if (!token) {
    return { provider, ok: false, error: "No token available" };
  }
  try {
    return { provider, ok: true, models: await requestModels(provider, token) };
  } catch (error) {
    return {
      provider,
      ok: false,
      error: error instanceof Error ? error.message : String(error),
    };
  }
};
// Query every provider in parallel.
var listAllProviderModels = async (providers) => {
  return Promise.all(providers.map((provider) => listProviderModels(provider)));
};
// Return the first model matching the provider's preference order, falling
// back to the first listed model (undefined when the list is empty).
var pickCheapestModel = (provider, models) => {
  for (const preference of cheapestModelPreferences[provider] ?? []) {
    for (const model of models) {
      if (matchesPreference(model, preference)) {
        return model;
      }
    }
  }
  return models[0];
};
// Resolve the cheapest usable model for a provider, or throw with a reason.
var resolveCheapestModel = async (provider) => {
  const result = await listProviderModels(provider);
  if (!result.ok) {
    throw new Error(result.error ?? `Unable to list models for provider: ${provider}`);
  }
  const model = pickCheapestModel(provider, result.models ?? []);
  if (!model) {
    throw new Error(`No models available for provider: ${provider}`);
  }
  return model;
};
|
|
3834
|
+
|
|
3835
|
+
// src/llm/resolveModel.ts
|
|
3836
|
+
// Turn a "provider/model" spec into an AI SDK language-model instance.
// Provider SDKs are loaded lazily via dynamic import so only the one that
// is actually used needs to be installed/initialized.
// Throws when the spec is malformed, when the provider is unsupported, or
// (opencode only) when no API key can be found.
var resolveModel = async (model) => {
  // Silence AI SDK warnings unless the caller opted in; ??= leaves any
  // explicit setting untouched.
  globalThis.AI_SDK_LOG_WARNINGS ??= false;
  process.env.AI_SDK_LOG_WARNINGS ??= "false";
  // Split on the FIRST "/" only: the remainder may itself contain slashes
  // (e.g. openrouter's "openai/gpt-4o").
  const [provider, ...rest] = model.split("/");
  const modelName = rest.join("/");
  if (!provider || !modelName) {
    throw new Error(`Invalid model format: ${model}. Expected format: provider/model (e.g., openai/gpt-4)`);
  }
  // Provider SDKs read their key from the environment; if it is unset,
  // promote a token from the persisted store into the env var first.
  const envVar = resolveProviderEnvVar(provider);
  if (envVar && !process.env[envVar]) {
    const storedToken = await resolveProviderToken(provider);
    if (storedToken) {
      process.env[envVar] = storedToken;
    }
  }
  switch (provider) {
    case "openai": {
      const { openai } = await import("@ai-sdk/openai");
      return openai(modelName);
    }
    case "anthropic": {
      const { anthropic } = await import("@ai-sdk/anthropic");
      return anthropic(modelName);
    }
    case "google": {
      const { google } = await import("@ai-sdk/google");
      return google(modelName);
    }
    case "opencode": {
      // The OpenCode gateway fronts several upstream vendors; the key is
      // passed explicitly (not via env) because the baseURL is overridden.
      const envVar2 = resolveProviderEnvVar("opencode");
      let apiKey = envVar2 ? process.env[envVar2] : void 0;
      if (!apiKey) {
        apiKey = await resolveProviderToken("opencode");
      }
      if (!apiKey) {
        throw new Error("OpenCode API key is required. Set OPENCODE_API_KEY environment variable or run 'struktur auth set --provider opencode --token <token>'");
      }
      // Route to the matching vendor SDK by model-name prefix; anything
      // that is not claude-* or gemini-* goes through the OpenAI-compatible
      // client.
      if (modelName.startsWith("claude-")) {
        const { createAnthropic } = await import("@ai-sdk/anthropic");
        return createAnthropic({
          apiKey,
          baseURL: "https://opencode.ai/zen/v1"
        })(modelName);
      } else if (modelName.startsWith("gemini-")) {
        const { createGoogleGenerativeAI } = await import("@ai-sdk/google");
        return createGoogleGenerativeAI({
          apiKey,
          baseURL: "https://opencode.ai/zen/v1"
        })(modelName);
      } else {
        const { createOpenAI } = await import("@ai-sdk/openai");
        return createOpenAI({
          apiKey,
          baseURL: "https://opencode.ai/zen/v1"
        })(modelName);
      }
    }
    case "openrouter": {
      const { openrouter } = await import("@openrouter/ai-sdk-provider");
      // "model#provider" syntax pins a preferred upstream provider; the
      // fragment is stripped from the model name and stashed on the
      // instance as a hidden, immutable property for later consumers.
      const hashIndex = modelName.indexOf("#");
      const actualModelName = hashIndex >= 0 ? modelName.slice(0, hashIndex) : modelName;
      const preferredProvider = hashIndex >= 0 ? modelName.slice(hashIndex + 1) : void 0;
      const modelInstance = openrouter(actualModelName);
      if (preferredProvider) {
        Object.defineProperty(modelInstance, "__openrouter_provider", {
          value: preferredProvider,
          writable: false,
          enumerable: false,
          configurable: false
        });
      }
      return modelInstance;
    }
    default:
      throw new Error(`Unsupported model provider: ${provider}. Supported providers: openai, anthropic, google, opencode, openrouter`);
  }
};
|
|
3913
|
+
|
|
3914
|
+
// src/auth/config.ts
|
|
3915
|
+
import path4 from "path";
|
|
3916
|
+
import os3 from "os";
|
|
3917
|
+
import { chmod as chmod2, mkdir as mkdir2, readFile as readFile4, writeFile as writeFile3, stat as stat2 } from "fs/promises";
|
|
3918
|
+
var CONFIG_DIR_ENV2 = "STRUKTUR_CONFIG_DIR";
// Same config directory resolution as the token store (duplicated by the
// bundler from a separate source module).
var resolveConfigDir2 = () => {
  return process.env[CONFIG_DIR_ENV2] ?? path4.join(os3.homedir(), ".config", "struktur");
};
var resolveConfigPath = () => path4.join(resolveConfigDir2(), "config.json");
var emptyStore2 = () => ({ version: 1 });
// Load config.json.
// Fix: mirrors the token-store fix — the original stat-then-read raced and
// let JSON.parse throw on a corrupt config.json. A missing, unreadable,
// corrupt, or version-mismatched file now degrades to an empty store.
var readConfigStore = async () => {
  const configPath = resolveConfigPath();
  let raw;
  try {
    raw = await readFile4(configPath, "utf-8");
  } catch {
    // Most commonly ENOENT on first run.
    return emptyStore2();
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Corrupt JSON: treat as empty rather than crashing the CLI.
    return emptyStore2();
  }
  if (!parsed || parsed.version !== 1) {
    return emptyStore2();
  }
  return parsed;
};
// Persist the config with owner-only permissions (0o700 dir, 0o600 file).
var writeConfigStore = async (store) => {
  const configDir = resolveConfigDir2();
  const configPath = resolveConfigPath();
  await mkdir2(configDir, { recursive: true, mode: 448 });
  await writeFile3(configPath, JSON.stringify(store, null, 2));
  await chmod2(configDir, 448);
  await chmod2(configPath, 384);
};
|
|
3946
|
+
// Read the user's configured default model spec, if any.
var getDefaultModel = async () => {
  const store = await readConfigStore();
  return store.defaultModel;
};
// Persist `model` as the default; returns it for convenient chaining.
var setDefaultModel = async (model) => {
  const store = await readConfigStore();
  store.defaultModel = model;
  await writeConfigStore(store);
  return model;
};
// All configured alias -> model mappings ({} when none are set).
var listAliases = async () => {
  const store = await readConfigStore();
  return store.aliases ?? {};
};
// Look up one alias; undefined when not configured.
var getAlias = async (alias) => {
  const store = await readConfigStore();
  return store.aliases?.[alias];
};
// Create or overwrite an alias; returns the model it now points at.
var setAlias = async (alias, model) => {
  const store = await readConfigStore();
  const aliases = store.aliases ?? {};
  aliases[alias] = model;
  store.aliases = aliases;
  await writeConfigStore(store);
  return model;
};
// Remove an alias; false when it did not exist.
var deleteAlias = async (alias) => {
  const store = await readConfigStore();
  if (!store.aliases?.[alias]) {
    return false;
  }
  delete store.aliases[alias];
  await writeConfigStore(store);
  return true;
};
// Expand `modelSpec` through the alias table; unknown specs pass through
// unchanged.
var resolveAlias = async (modelSpec) => {
  const aliases = await listAliases();
  return aliases[modelSpec] ?? modelSpec;
};
|
|
3984
|
+
var listParsers = async () => {
|
|
3985
|
+
const store = await readConfigStore();
|
|
3986
|
+
return store.parsers ?? {};
|
|
3987
|
+
};
|
|
3988
|
+
var getParser = async (mimeType) => {
|
|
3989
|
+
const store = await readConfigStore();
|
|
3990
|
+
return store.parsers?.[mimeType];
|
|
3991
|
+
};
|
|
3992
|
+
// Register (or replace) the parser definition for a MIME type and persist it.
//
// "command-file" definitions must embed the FILE_PATH placeholder in their
// command template. Guard `typeof def.command` first so a missing/non-string
// command yields the descriptive validation Error below rather than an
// opaque `TypeError: Cannot read properties of undefined (reading 'includes')`.
var setParser = async (mimeType, def) => {
  if (
    def.type === "command-file" &&
    (typeof def.command !== "string" || !def.command.includes("FILE_PATH"))
  ) {
    throw new Error(
      `command-file parser must contain FILE_PATH placeholder in the command string. Got: "${def.command}"`
    );
  }
  const store = await readConfigStore();
  store.parsers ??= {};
  store.parsers[mimeType] = def;
  await writeConfigStore(store);
};
|
|
4003
|
+
// Unregister the parser for a MIME type. Returns true when an entry was
// removed and persisted; false (without writing) when none was registered.
var deleteParser = async (mimeType) => {
  const config = await readConfigStore();
  const existing = config.parsers?.[mimeType];
  if (!existing) return false;
  delete config.parsers[mimeType];
  await writeConfigStore(config);
  return true;
};
|
|
4012
|
+
// Read the stored telemetry section, or undefined when never configured.
var getTelemetryConfig = async () => {
  const { telemetry } = await readConfigStore();
  return telemetry;
};
|
|
4016
|
+
// Replace the telemetry section wholesale with `config` and persist.
var setTelemetryConfig = async (config) => {
  const current = await readConfigStore();
  current.telemetry = config;
  await writeConfigStore(current);
};
|
|
4021
|
+
// Turn telemetry on for `provider`, merging any extra `options` into the
// stored section, then persist.
//
// Spread `options` FIRST: the explicit `enabled: true` and `provider` must
// win. Previously the spread came last, so a stray `enabled: false` or
// `provider` key inside the options bag could silently contradict this
// call's intent and leave telemetry disabled.
var enableTelemetry = async (provider, options) => {
  const store = await readConfigStore();
  store.telemetry = {
    ...options,
    enabled: true,
    provider
  };
  await writeConfigStore(store);
};
|
|
4030
|
+
// Flip telemetry off when a telemetry section exists. The store is written
// back either way, matching the original's unconditional persist.
var disableTelemetry = async () => {
  const config = await readConfigStore();
  if (config.telemetry) config.telemetry.enabled = false;
  await writeConfigStore(config);
};
|
|
4037
|
+
// Drop the telemetry section entirely. Returns true when it existed and the
// removal was persisted; false (without writing) when there was nothing to do.
var deleteTelemetryConfig = async () => {
  const config = await readConfigStore();
  if (!config.telemetry) return false;
  delete config.telemetry;
  await writeConfigStore(config);
  return true;
};
|
|
4046
|
+
export {
|
|
4047
|
+
AgentStrategy,
|
|
4048
|
+
DoublePassAutoMergeStrategy,
|
|
4049
|
+
DoublePassStrategy,
|
|
4050
|
+
ParallelAutoMergeStrategy,
|
|
4051
|
+
ParallelStrategy,
|
|
4052
|
+
SchemaValidationError,
|
|
4053
|
+
SequentialAutoMergeStrategy,
|
|
4054
|
+
SequentialStrategy,
|
|
4055
|
+
SimpleStrategy,
|
|
4056
|
+
agent,
|
|
4057
|
+
buildSchemaFromFields,
|
|
4058
|
+
buildSchemaFromParsedFields,
|
|
4059
|
+
clearArtifactInputParsers,
|
|
4060
|
+
collectStream,
|
|
4061
|
+
createDebugLogger,
|
|
4062
|
+
defaultArtifactProviders,
|
|
4063
|
+
deleteAlias,
|
|
4064
|
+
deleteParser,
|
|
4065
|
+
deleteProviderToken,
|
|
4066
|
+
deleteTelemetryConfig,
|
|
4067
|
+
detectMimeType,
|
|
4068
|
+
disableTelemetry,
|
|
4069
|
+
doublePass,
|
|
4070
|
+
doublePassAutoMerge,
|
|
4071
|
+
enableTelemetry,
|
|
4072
|
+
extract,
|
|
4073
|
+
fileToArtifact,
|
|
4074
|
+
getAlias,
|
|
4075
|
+
getDefaultModel,
|
|
4076
|
+
getParser,
|
|
4077
|
+
getProviderTokenOrThrow,
|
|
4078
|
+
getTelemetryConfig,
|
|
4079
|
+
hydrateSerializedArtifacts,
|
|
4080
|
+
listAliases,
|
|
4081
|
+
listAllProviderModels,
|
|
4082
|
+
listParsers,
|
|
4083
|
+
listProviderModels,
|
|
4084
|
+
listStoredProviders,
|
|
4085
|
+
maskToken,
|
|
4086
|
+
parallel,
|
|
4087
|
+
parallelAutoMerge,
|
|
4088
|
+
parse,
|
|
4089
|
+
parseFieldsString,
|
|
4090
|
+
parsePdf,
|
|
4091
|
+
parseSerializedArtifacts,
|
|
4092
|
+
registerArtifactInputParser,
|
|
4093
|
+
resolveAlias,
|
|
4094
|
+
resolveCheapestModel,
|
|
4095
|
+
resolveModel,
|
|
4096
|
+
resolveProviderEnvVar,
|
|
4097
|
+
resolveProviderToken,
|
|
4098
|
+
runParser,
|
|
4099
|
+
sequential,
|
|
4100
|
+
sequentialAutoMerge,
|
|
4101
|
+
setAlias,
|
|
4102
|
+
setDefaultModel,
|
|
4103
|
+
setParser,
|
|
4104
|
+
setProviderToken,
|
|
4105
|
+
setTelemetryConfig,
|
|
4106
|
+
simple,
|
|
4107
|
+
splitTextIntoContents,
|
|
4108
|
+
urlToArtifact,
|
|
4109
|
+
validateSerializedArtifacts
|
|
4110
|
+
};
|
|
4111
|
+
//# sourceMappingURL=index.js.map
|