@struktur/sdk 2.1.1 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105):
  1. package/dist/index.js +4111 -0
  2. package/dist/index.js.map +1 -0
  3. package/dist/parsers.js +492 -0
  4. package/dist/parsers.js.map +1 -0
  5. package/dist/strategies.js +2435 -0
  6. package/dist/strategies.js.map +1 -0
  7. package/package.json +25 -13
  8. package/src/agent-cli-integration.test.ts +0 -47
  9. package/src/agent-export.test.ts +0 -17
  10. package/src/agent-tool-labels.test.ts +0 -50
  11. package/src/artifacts/AGENTS.md +0 -16
  12. package/src/artifacts/fileToArtifact.test.ts +0 -37
  13. package/src/artifacts/fileToArtifact.ts +0 -44
  14. package/src/artifacts/input.test.ts +0 -243
  15. package/src/artifacts/input.ts +0 -360
  16. package/src/artifacts/providers.test.ts +0 -19
  17. package/src/artifacts/providers.ts +0 -7
  18. package/src/artifacts/urlToArtifact.test.ts +0 -23
  19. package/src/artifacts/urlToArtifact.ts +0 -19
  20. package/src/auth/AGENTS.md +0 -11
  21. package/src/auth/config.test.ts +0 -132
  22. package/src/auth/config.ts +0 -186
  23. package/src/auth/tokens.test.ts +0 -58
  24. package/src/auth/tokens.ts +0 -229
  25. package/src/chunking/AGENTS.md +0 -11
  26. package/src/chunking/ArtifactBatcher.test.ts +0 -22
  27. package/src/chunking/ArtifactBatcher.ts +0 -110
  28. package/src/chunking/ArtifactSplitter.test.ts +0 -38
  29. package/src/chunking/ArtifactSplitter.ts +0 -151
  30. package/src/debug/AGENTS.md +0 -79
  31. package/src/debug/logger.test.ts +0 -244
  32. package/src/debug/logger.ts +0 -211
  33. package/src/extract.test.ts +0 -22
  34. package/src/extract.ts +0 -150
  35. package/src/fields.test.ts +0 -681
  36. package/src/fields.ts +0 -246
  37. package/src/index.test.ts +0 -20
  38. package/src/index.ts +0 -110
  39. package/src/llm/AGENTS.md +0 -9
  40. package/src/llm/LLMClient.test.ts +0 -394
  41. package/src/llm/LLMClient.ts +0 -264
  42. package/src/llm/RetryingRunner.test.ts +0 -174
  43. package/src/llm/RetryingRunner.ts +0 -270
  44. package/src/llm/message.test.ts +0 -42
  45. package/src/llm/message.ts +0 -47
  46. package/src/llm/models.test.ts +0 -82
  47. package/src/llm/models.ts +0 -190
  48. package/src/llm/resolveModel.ts +0 -86
  49. package/src/merge/AGENTS.md +0 -6
  50. package/src/merge/Deduplicator.test.ts +0 -108
  51. package/src/merge/Deduplicator.ts +0 -45
  52. package/src/merge/SmartDataMerger.test.ts +0 -177
  53. package/src/merge/SmartDataMerger.ts +0 -56
  54. package/src/parsers/AGENTS.md +0 -58
  55. package/src/parsers/collect.test.ts +0 -56
  56. package/src/parsers/collect.ts +0 -31
  57. package/src/parsers/index.ts +0 -6
  58. package/src/parsers/mime.test.ts +0 -91
  59. package/src/parsers/mime.ts +0 -137
  60. package/src/parsers/npm.ts +0 -26
  61. package/src/parsers/pdf.test.ts +0 -394
  62. package/src/parsers/pdf.ts +0 -194
  63. package/src/parsers/runner.test.ts +0 -95
  64. package/src/parsers/runner.ts +0 -177
  65. package/src/parsers/types.ts +0 -29
  66. package/src/prompts/AGENTS.md +0 -8
  67. package/src/prompts/DeduplicationPrompt.test.ts +0 -41
  68. package/src/prompts/DeduplicationPrompt.ts +0 -37
  69. package/src/prompts/ExtractorPrompt.test.ts +0 -21
  70. package/src/prompts/ExtractorPrompt.ts +0 -72
  71. package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
  72. package/src/prompts/ParallelMergerPrompt.ts +0 -37
  73. package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
  74. package/src/prompts/SequentialExtractorPrompt.ts +0 -82
  75. package/src/prompts/formatArtifacts.test.ts +0 -39
  76. package/src/prompts/formatArtifacts.ts +0 -46
  77. package/src/strategies/AGENTS.md +0 -6
  78. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
  79. package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
  80. package/src/strategies/DoublePassStrategy.test.ts +0 -48
  81. package/src/strategies/DoublePassStrategy.ts +0 -266
  82. package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
  83. package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
  84. package/src/strategies/ParallelStrategy.test.ts +0 -61
  85. package/src/strategies/ParallelStrategy.ts +0 -208
  86. package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
  87. package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
  88. package/src/strategies/SequentialStrategy.test.ts +0 -53
  89. package/src/strategies/SequentialStrategy.ts +0 -142
  90. package/src/strategies/SimpleStrategy.test.ts +0 -46
  91. package/src/strategies/SimpleStrategy.ts +0 -94
  92. package/src/strategies/concurrency.test.ts +0 -16
  93. package/src/strategies/concurrency.ts +0 -14
  94. package/src/strategies/index.test.ts +0 -20
  95. package/src/strategies/index.ts +0 -7
  96. package/src/strategies/utils.test.ts +0 -76
  97. package/src/strategies/utils.ts +0 -95
  98. package/src/tokenization.test.ts +0 -119
  99. package/src/tokenization.ts +0 -71
  100. package/src/types.test.ts +0 -25
  101. package/src/types.ts +0 -174
  102. package/src/validation/AGENTS.md +0 -7
  103. package/src/validation/validator.test.ts +0 -204
  104. package/src/validation/validator.ts +0 -90
  105. package/tsconfig.json +0 -22
@@ -0,0 +1,2435 @@
1
+ // src/prompts/formatArtifacts.ts
2
/**
 * Resolve the reference string used for an image in the artifact XML.
 *
 * @param {string} artifactId - Id of the artifact that owns the image.
 * @param {number} index - Zero-based position of the image within its content entry.
 * @param {{ url?: string, base64?: string }} image - Image descriptor.
 * @returns {string} The image URL when present, otherwise a synthetic
 *   `artifact:<id>/images/image<N>.<ext>` reference (N is one-based).
 */
var imageRefFor = (artifactId, index, image) => {
  if (!image.url) {
    // Base64 payloads are labelled as PNG; anything else gets a generic extension.
    const extension = image.base64 ? "png" : "bin";
    return `artifact:${artifactId}/images/image${index + 1}.${extension}`;
  }
  return image.url;
};
9
/**
 * Escape the five XML special characters in a string.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text safe to embed in XML element content or attribute values.
 */
var escapeXml = (value) => {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    "\"": "&quot;",
    "'": "&apos;"
  };
  // Single pass: each special character maps to exactly one entity.
  return value.replace(/[&<>"']/g, (char) => entities[char]);
};
12
/**
 * Render artifacts as the XML fragment embedded in extraction prompts.
 *
 * Each artifact becomes an `<artifact id type>` element containing one
 * `<text>` element per content entry with text and one `<image>` element per
 * media item. Every interpolated attribute value is XML-escaped so that a
 * quote or angle bracket in `id`, `type` or `page` cannot break the markup.
 *
 * @param {Array<{ id: string, type: unknown, contents: Array<object> }>} artifacts
 * @returns {string} Newline-joined XML; an empty string for an empty list.
 */
var formatArtifactsXml = (artifacts) => {
  const parts = [];
  for (const artifact of artifacts) {
    parts.push(
      `<artifact id="${escapeXml(artifact.id)}" type="${escapeXml(String(artifact.type))}">`
    );
    for (const content of artifact.contents) {
      // The same page attribute is attached to the text and to every image of the entry.
      const pageAttr = content.page !== void 0 ? ` page="${escapeXml(String(content.page))}"` : "";
      if (content.text) {
        parts.push(` <text${pageAttr}>${escapeXml(content.text)}</text>`);
      }
      if (content.media?.length) {
        content.media.forEach((media, index) => {
          const ref = imageRefFor(artifact.id, index, media);
          parts.push(` <image ref="${escapeXml(ref)}"${pageAttr} />`);
        });
      }
    }
    parts.push("</artifact>");
  }
  return parts.join("\n");
};
33
+
34
+ // src/prompts/ExtractorPrompt.ts
35
/**
 * Build the system prompt for a single-shot extraction call.
 *
 * @param {string} schema - Serialized JSON schema, inserted verbatim.
 * @param {string} [outputInstructions] - Extra caller instructions; a fixed
 *   placeholder sentence is used when this is null or undefined.
 * @returns {string} The complete `<instructions>` prompt.
 */
var extractorSystemPrompt = (schema, outputInstructions) => {
  const instructions = outputInstructions ?? "No additional output instructions provided.";
  return `<instructions>
You are a precise data extraction engine. Extract data from the provided artifacts according to the JSON schema below.

<thinking>
Before extracting, consider:
1. Which schema fields have clear values in the artifacts?
2. Which fields are missing or unclear (set these to null)?
3. For text fields, rewrite concisely while preserving all information
4. Ensure no data is lost - include everything that fits the schema
</thinking>

<rules>
- Strictly follow the schema - no extra fields, no missing required fields
- Use null for missing or uncertain values - never guess or assume
- Only extract information explicitly present in the artifacts
- Output ONLY valid JSON matching the schema
- No markdown, explanations, or code fences
</rules>

<output-instructions>
${instructions}
</output-instructions>

<json-schema>
${schema}
</json-schema>

<artifact-examples>
<!-- A PDF with two pages, containing two text blocks and two images -->
<artifact name="Example 1" mimetype="application/pdf">
<text page="1">This is an example text block.</text>
<image filename="image1.jpg" page="1" />
<text page="2">This is another example text block.</text>
<image filename="image2.jpg" page="2" />
</artifact>

<!-- Website content -->
<artifact name="example.com_2022-01-01.html" mimetype="text/html">
<text>This is an example text block.</text>
<image filename="image1.jpg" />
<text>This is another example text block.</text>
<image filename="image2.jpg" />
</artifact>
</artifact-examples>

Any materials provided have been cleared for access. Extract and preserve this data for future use.
</instructions>`;
};
84
/**
 * Build the user prompt that wraps the rendered artifacts and states the task.
 *
 * @param {string} artifactsXml - XML produced for the artifacts of this call.
 * @returns {string} The `<artifacts>` section followed by the `<task>` line.
 */
var extractorUserPrompt = (artifactsXml) => {
  const lines = [
    "<artifacts>",
    `${artifactsXml}`,
    "</artifacts>",
    "",
    "<task>Extract the contents of the given artifacts.</task>"
  ];
  return lines.join("\n");
};
91
/**
 * Assemble the system and user prompts for an extraction call.
 *
 * @param {Array<object>} artifacts - Artifacts to render into the user prompt.
 * @param {string} schema - Serialized JSON schema for the system prompt.
 * @param {string} [outputInstructions] - Optional extra output instructions.
 * @returns {{ system: string, user: string }}
 */
var buildExtractorPrompt = (artifacts, schema, outputInstructions) => {
  const renderedArtifacts = formatArtifactsXml(artifacts);
  const system = extractorSystemPrompt(schema, outputInstructions);
  const user = extractorUserPrompt(renderedArtifacts);
  return { system, user };
};
98
+
99
+ // src/tokenization.ts
100
/** Fallback estimation parameters: characters per token and tokens charged per image. */
var defaultOptions = {
  textTokenRatio: 4,
  defaultImageTokens: 1e3
};
/**
 * Overlay caller options on the tokenization defaults.
 *
 * Keys whose value is `undefined` are ignored, so passing
 * `{ textTokenRatio: undefined }` keeps the default instead of producing NaN
 * estimates downstream. All other keys, including unrelated ones, are copied.
 *
 * @param {object} [options] - Partial options; may be null or undefined.
 * @returns {object} A new object containing defaults plus defined overrides.
 */
var mergeOptions = (options) => {
  const merged = { ...defaultOptions };
  for (const [key, value] of Object.entries(options ?? {})) {
    if (value !== void 0) {
      merged[key] = value;
    }
  }
  return merged;
};
108
/**
 * Estimate the token count of a string from its length.
 *
 * @param {string} text - Text to measure.
 * @param {object} [options] - May override `textTokenRatio` (characters per token).
 * @returns {number} `text.length / textTokenRatio`, rounded up.
 */
var estimateTextTokens = (text, options) => {
  const charsPerToken = mergeOptions(options).textTokenRatio;
  return Math.ceil(text.length / charsPerToken);
};
112
/**
 * Estimate the token cost of one image.
 *
 * @param {object} _image - Unused; every image is charged the same flat amount.
 * @param {object} [options] - May override `defaultImageTokens`.
 * @returns {number} The configured per-image token cost.
 */
var estimateImageTokens = (_image, options) => mergeOptions(options).defaultImageTokens;
116
/**
 * Estimate the tokens of one content entry: its text plus, for each media
 * item, the flat image cost and the cost of any text attached to the item.
 *
 * @param {{ text?: string, media?: Array<{ text?: string }> }} content
 * @param {object} [options] - Tokenization overrides.
 * @returns {number} Estimated token count.
 */
var countContentTokens = (content, options) => {
  let total = content.text ? estimateTextTokens(content.text, options) : 0;
  if (!content.media?.length) {
    return total;
  }
  for (const item of content.media) {
    total += estimateImageTokens(item, options);
    if (item.text) {
      total += estimateTextTokens(item.text, options);
    }
  }
  return total;
};
131
/**
 * Token count of an artifact.
 *
 * @param {{ tokens?: number, contents: Array<object> }} artifact
 * @param {object} [options] - Tokenization overrides.
 * @returns {number} The precomputed `artifact.tokens` when it is a number,
 *   otherwise the sum of the estimates of all content entries.
 */
var countArtifactTokens = (artifact, options) => {
  if (typeof artifact.tokens !== "number") {
    let total = 0;
    for (const content of artifact.contents) {
      total += countContentTokens(content, options);
    }
    return total;
  }
  return artifact.tokens;
};
140
/**
 * Count the media items across all content entries of an artifact.
 *
 * @param {{ contents: Array<{ media?: Array<object> }> }} artifact
 * @returns {number} Total number of media items.
 */
var countArtifactImages = (artifact) => {
  let images = 0;
  for (const content of artifact.contents) {
    images += content.media?.length ?? 0;
  }
  return images;
};
145
+
146
+ // src/chunking/ArtifactSplitter.ts
147
/**
 * Split one content entry into several entries when its text exceeds the
 * token budget.
 *
 * Content without text, or whose text fits, is returned unchanged as a
 * single-element array. Otherwise the text is cut into fixed-size character
 * slices; every slice keeps the page number and only the first slice keeps
 * the media.
 *
 * @param {{ text?: string, page?: number, media?: Array<object> }} content
 * @param {number} maxTokens - Token budget per chunk.
 * @param {object} [options] - May carry `textTokenRatio` (characters per token).
 * @param {object} [debug] - Optional logger; `chunkingSplit` is called when a split happens.
 * @param {string} [artifactId] - Artifact id reported to the logger.
 * @returns {Array<object>} One or more content entries.
 */
var splitTextIntoChunks = (content, maxTokens, options, debug, artifactId) => {
  const { text } = content;
  if (!text) {
    return [content];
  }
  const totalTokens = estimateTextTokens(text, options);
  if (totalTokens <= maxTokens) {
    return [content];
  }
  // Characters per slice; never below one so the loop always advances.
  const chunkSize = Math.max(1, maxTokens * (options?.textTokenRatio ?? 4));
  if (debug && artifactId) {
    debug.chunkingSplit({
      artifactId,
      originalContentCount: 1,
      splitContentCount: Math.ceil(text.length / chunkSize),
      splitReason: "text_too_long",
      originalTokens: totalTokens,
      chunkSize
    });
  }
  const pieces = [];
  let offset = 0;
  while (offset < text.length) {
    pieces.push({
      page: content.page,
      text: text.slice(offset, offset + chunkSize),
      // Media travels with the first slice only so images are not duplicated.
      media: offset === 0 ? content.media : void 0
    });
    offset += chunkSize;
  }
  return pieces;
};
178
/**
 * Split an artifact into parts that each respect the token and image limits.
 *
 * Oversized text entries are first cut with `splitTextIntoChunks`; the
 * resulting entries are then packed greedily, in order, into parts with ids
 * `<artifactId>:part:<n>` (n starts at 1). A single entry that alone exceeds
 * a limit still forms its own part. An artifact without contents yields one
 * part that copies the artifact.
 *
 * @param {{ id: string, contents: Array<object> }} artifact
 * @param {{ maxTokens: number, maxImages?: number, debug?: object }} options -
 *   Also forwarded to the token counters as tokenization overrides.
 * @returns {Array<object>} Artifact parts carrying `contents` and `tokens`.
 */
var splitArtifact = (artifact, options) => {
  const { maxTokens, maxImages, debug } = options;
  const totalTokens = countArtifactTokens(artifact, options);
  debug?.chunkingStart({
    artifactId: artifact.id,
    totalTokens,
    maxTokens,
    maxImages
  });
  const pieces = artifact.contents.flatMap(
    (content) => splitTextIntoChunks(content, maxTokens, options, debug, artifact.id)
  );
  const chunks = [];
  let pending = [];
  let pendingTokens = 0;
  let pendingImages = 0;
  // Close the part under construction and start an empty one.
  const flush = () => {
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:${chunks.length + 1}`,
      contents: pending,
      tokens: pendingTokens
    });
    pending = [];
    pendingTokens = 0;
    pendingImages = 0;
  };
  for (const piece of pieces) {
    const pieceTokens = countContentTokens(piece, options);
    const pieceImages = piece.media?.length ?? 0;
    // Limits are only checked once the part holds something, so a lone
    // oversized entry is still emitted.
    if (pending.length > 0) {
      const overTokens = pendingTokens + pieceTokens > maxTokens;
      const overImages = maxImages !== void 0 && pendingImages + pieceImages > maxImages;
      if (overTokens || overImages) {
        if (debug) {
          debug.chunkingSplit({
            artifactId: artifact.id,
            originalContentCount: pieces.length,
            splitContentCount: chunks.length + 1,
            splitReason: "content_limit",
            originalTokens: totalTokens,
            chunkSize: maxTokens
          });
        }
        flush();
      }
    }
    pending.push(piece);
    pendingTokens += pieceTokens;
    pendingImages += pieceImages;
  }
  if (pending.length > 0) {
    flush();
  }
  if (chunks.length === 0) {
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:1`,
      tokens: countArtifactTokens(artifact, options)
    });
  }
  debug?.chunkingResult({
    artifactId: artifact.id,
    chunksCreated: chunks.length,
    chunkSizes: chunks.map((chunk) => chunk.tokens ?? 0)
  });
  return chunks;
};
247
+
248
+ // src/chunking/ArtifactBatcher.ts
249
/**
 * Group artifacts into batches that fit the token and image limits.
 *
 * Every artifact is first split with `splitArtifact`; the resulting parts are
 * packed greedily, in order, into batches. The effective token limit is the
 * smaller of `maxTokens` and `modelMaxTokens` when the latter is set.
 *
 * @param {Array<object>} artifacts
 * @param {{ maxTokens: number, maxImages?: number, modelMaxTokens?: number,
 *   textTokenRatio?: number, defaultImageTokens?: number, debug?: object }} options
 * @returns {Array<Array<object>>} Batches of artifact parts.
 */
var batchArtifacts = (artifacts, options) => {
  const { debug, maxImages, modelMaxTokens, textTokenRatio, defaultImageTokens } = options;
  const maxTokens = modelMaxTokens ? Math.min(options.maxTokens, modelMaxTokens) : options.maxTokens;
  debug?.batchingStart({
    totalArtifacts: artifacts.length,
    maxTokens: options.maxTokens,
    maxImages,
    modelMaxTokens,
    effectiveMaxTokens: maxTokens
  });
  // Optional settings are only forwarded when defined so that the splitter
  // never receives explicit `undefined` keys.
  const splitOptions = { maxTokens, debug };
  if (maxImages !== void 0) splitOptions.maxImages = maxImages;
  if (textTokenRatio !== void 0) splitOptions.textTokenRatio = textTokenRatio;
  if (defaultImageTokens !== void 0) splitOptions.defaultImageTokens = defaultImageTokens;

  const batches = [];
  let current = [];
  let currentTokens = 0;
  let currentImages = 0;
  // Report and store the batch under construction, then start a new one.
  const flush = () => {
    debug?.batchCreated({
      batchIndex: batches.length,
      artifactCount: current.length,
      totalTokens: currentTokens,
      totalImages: currentImages,
      artifactIds: current.map((item) => item.id)
    });
    batches.push(current);
    current = [];
    currentTokens = 0;
    currentImages = 0;
  };
  for (const artifact of artifacts) {
    for (const part of splitArtifact(artifact, splitOptions)) {
      const partTokens = countArtifactTokens(part, options);
      const partImages = countArtifactImages(part);
      if (current.length > 0) {
        const overTokens = currentTokens + partTokens > maxTokens;
        const overImages = maxImages !== void 0 && currentImages + partImages > maxImages;
        if (overTokens || overImages) {
          flush();
        }
      }
      current.push(part);
      currentTokens += partTokens;
      currentImages += partImages;
    }
  }
  if (current.length > 0) {
    flush();
  }
  debug?.batchingComplete({
    totalBatches: batches.length,
    batches: batches.map((batch, index) => ({
      index,
      artifactCount: batch.length,
      tokens: batch.reduce((sum, item) => sum + (item.tokens ?? 0), 0),
      images: batch.reduce(
        (sum, item) => sum + item.contents.reduce((count, content) => count + (content.media?.length ?? 0), 0),
        0
      )
    }))
  });
  return batches;
};
319
+
320
+ // src/llm/message.ts
321
/**
 * Collect every image of the given artifacts as message parts.
 *
 * For each media item the first truthy source among `contents`, `base64`
 * and `url` is used; items with none of them are skipped.
 *
 * @param {Array<{ contents: Array<{ media?: Array<object> }> }>} artifacts
 * @returns {Array<{ type: "image", image: unknown }>} Image parts in document order.
 */
var collectImages = (artifacts) => {
  const parts = [];
  for (const { contents } of artifacts) {
    for (const { media } of contents) {
      if (!media?.length) {
        continue;
      }
      for (const item of media) {
        const image = item.contents || item.base64 || item.url;
        if (image) {
          parts.push({ type: "image", image });
        }
      }
    }
  }
  return parts;
};
341
/**
 * Build the user message content for a call.
 *
 * @param {string} text - Prompt text.
 * @param {Array<object>} artifacts - Artifacts whose images should be attached.
 * @returns {string|Array<object>} The bare text when there are no images,
 *   otherwise a text part followed by the image parts.
 */
var buildUserContent = (text, artifacts) => {
  const images = collectImages(artifacts);
  return images.length === 0 ? text : [{ type: "text", text }, ...images];
};
348
+
349
+ // src/validation/validator.ts
350
+ import Ajv from "ajv";
351
+ import addFormats from "ajv-formats";
352
/**
 * Error raised when data does not satisfy a JSON schema.
 * The validator's error objects are exposed on `errors`.
 */
var SchemaValidationError = class extends Error {
  errors;
  /**
   * @param {string} message - Human-readable summary.
   * @param {Array<object>} errors - Validation errors describing each violation.
   */
  constructor(message, errors) {
    super(message);
    Object.assign(this, { name: "SchemaValidationError", errors });
  }
};
360
/** Matches synthetic image references of the form `artifact:<id>/images/image<N>.<ext>`. */
var ARTIFACT_ID_PATTERN = /^artifact:[^/]+\/images\/image\d+\.\w+$/;
/**
 * Create the Ajv instance used for output validation.
 *
 * The instance collects all errors, runs in non-strict mode, allows union
 * types, has the `ajv-formats` formats installed and registers the custom
 * string format `artifact-id`.
 *
 * @returns {object} A configured Ajv instance.
 */
var createAjv = () => {
  const settings = {
    allErrors: true,
    strict: false,
    allowUnionTypes: true
  };
  const instance = new Ajv(settings);
  addFormats(instance);
  const matchesArtifactRef = (data) => ARTIFACT_ID_PATTERN.test(data);
  instance.addFormat("artifact-id", { type: "string", validate: matchesArtifactRef });
  return instance;
};
374
/**
 * Validate data against a schema and return it, or throw.
 *
 * @param {object} ajv - Ajv instance used to compile the schema.
 * @param {object} schema - JSON schema.
 * @param {unknown} data - Value to validate.
 * @returns {unknown} `data`, unchanged, when it is valid.
 * @throws {SchemaValidationError} Carrying the validator's errors when invalid.
 */
var validateOrThrow = (ajv, schema, data) => {
  const check = ajv.compile(schema);
  if (check(data)) {
    return data;
  }
  throw new SchemaValidationError("Schema validation failed", check.errors ?? []);
};
384
/**
 * Tell whether a validation error reports a missing required property.
 *
 * @param {{ keyword: string }} error - A validation error object.
 * @returns {boolean}
 */
var isRequiredError = (error) => error.keyword === "required";
387
/**
 * Validate data while optionally tolerating missing required properties.
 *
 * Errors other than "required" always make the result invalid and are the
 * only ones reported. When every error is a "required" error, the data is
 * accepted if `isFinalAttempt` is true and rejected with all errors otherwise.
 *
 * @param {object} ajv - Ajv instance used to compile the schema.
 * @param {object} schema - JSON schema.
 * @param {unknown} data - Value to validate.
 * @param {boolean} [isFinalAttempt=true] - Whether missing required fields are acceptable.
 * @returns {{ valid: true, data: unknown } | { valid: false, errors: Array<object> }}
 */
var validateAllowingMissingRequired = (ajv, schema, data, isFinalAttempt = true) => {
  const check = ajv.compile(schema);
  if (check(data)) {
    return { valid: true, data };
  }
  const errors = check.errors ?? [];
  const blocking = errors.filter((error) => !isRequiredError(error));
  if (blocking.length > 0) {
    return { valid: false, errors: blocking };
  }
  return isFinalAttempt ? { valid: true, data } : { valid: false, errors };
};
403
+
404
+ // src/llm/LLMClient.ts
405
+ import { generateText, Output, jsonSchema } from "ai";
406
/**
 * Duck-type check for a Zod schema: a non-null object exposing a
 * `safeParse` function.
 *
 * @param {unknown} schema
 * @returns {boolean}
 */
var isZodSchema = (schema) => {
  if (typeof schema !== "object" || schema === null) {
    return false;
  }
  return "safeParse" in schema && typeof schema.safeParse === "function";
};
409
/**
 * Map a provider API error to an error with an actionable message.
 *
 * Only errors that expose both `responseBody` and `statusCode` are inspected;
 * anything else, and API errors matching no known case, are returned as-is.
 * Translated errors keep the original error as `cause`.
 *
 * @param {unknown} error - Value thrown by the model call.
 * @param {string} modelId - Model identifier used in the messages.
 * @returns {unknown} A new `Error`, or the original value.
 */
var toFriendlyProviderError = (error, modelId) => {
  const isApiError = error && typeof error === "object" && "responseBody" in error && "statusCode" in error;
  if (!isApiError) {
    return error;
  }
  const { responseBody, statusCode } = error;
  const errorData = error.data;
  const wrap = (message) => new Error(message, { cause: error });
  if (typeof responseBody === "string" && responseBody.includes("No endpoints found that support image input")) {
    return wrap(
      `Model "${modelId}" does not support image input. Please use a model that supports images (e.g., gpt-4o, claude-3-5-sonnet, gemini-1.5-pro) or remove the --images and --screenshots flags.`
    );
  }
  if (errorData?.code === 500 || errorData?.message?.includes("Internal Server Error")) {
    return wrap(
      `Provider error for model "${modelId}": Internal server error. The model or provider may be experiencing issues. Please try again or use a different model.`
    );
  }
  if (statusCode === 401 || errorData?.code === 401) {
    return wrap(
      `Authentication failed for model "${modelId}". Please check your API key is valid and has the necessary permissions.`
    );
  }
  if (statusCode === 403 || errorData?.code === 403) {
    return wrap(
      `Access denied for model "${modelId}". Your API key may not have access to this model. Please check your subscription or try a different model.`
    );
  }
  if (statusCode === 429 || errorData?.code === 429) {
    return wrap(
      `Rate limit exceeded for model "${modelId}". Please wait a moment and try again, or use a different model.`
    );
  }
  if (statusCode === 404 || errorData?.code === 404) {
    const errorMsg = errorData?.message || "Model not found";
    return wrap(
      `Model "${modelId}" not found or unavailable. ${errorMsg} Please check the model name or try a different model.`
    );
  }
  if (errorData?.message) {
    return wrap(`Provider error for model "${modelId}": ${errorData.message}`);
  }
  return error;
};
/**
 * Run one structured-output model call and normalize its result.
 *
 * @param {object} request
 * @param {unknown} request.model - Model reference; an optional
 *   `__openrouter_provider` property pins the OpenRouter provider.
 * @param {object} request.schema - JSON schema, or an object with `safeParse` (used as-is).
 * @param {string} [request.schemaName] - Output name; defaults to "extract".
 * @param {string} [request.schemaDescription] - Output description.
 * @param {string} request.system - System prompt.
 * @param {unknown} [request.user] - User content, used when `messages` is absent.
 * @param {Array<object>} [request.messages] - Full message list.
 * @param {boolean} [request.strict] - Forwarded as OpenAI `strictJsonSchema`.
 * @param {object} [request.telemetry] - Optional telemetry sink.
 * @param {object} [request.parentSpan] - Parent span for the LLM span.
 * @returns {Promise<{ data: unknown, usage: { inputTokens: number, outputTokens: number, totalTokens: number } }>}
 * @throws {Error} A translated provider error (see `toFriendlyProviderError`)
 *   or the original failure. The telemetry span is ended in both cases.
 */
var generateStructured = async (request) => {
  const { telemetry, parentSpan } = request;
  const llmSpan = telemetry?.startSpan({
    name: "llm.generateStructured",
    kind: "LLM",
    parentSpan,
    attributes: {
      "llm.schema_name": request.schemaName ?? "extract",
      "llm.strict": request.strict ?? false
    }
  });
  const startTime = Date.now();
  const schema = isZodSchema(request.schema) ? request.schema : jsonSchema(request.schema);
  const preferredProvider = request.model?.__openrouter_provider;
  if (preferredProvider && process.env.DEBUG) {
    console.error(
      `[DEBUG] Routing to OpenRouter provider: ${preferredProvider}`
    );
  }
  // Merge instead of overriding: the OpenAI strictness flag must survive when
  // an OpenRouter provider preference is also present.
  const providerOptions = {
    openai: {
      strictJsonSchema: request.strict ?? false
    },
    ...(preferredProvider ? { openrouter: { provider: { order: [preferredProvider] } } } : {})
  };
  // Input snapshot recorded with telemetry events; non-string user content is not recorded.
  const telemetryInput = () => ({
    messages: request.messages ?? [{ role: "user", content: typeof request.user === "string" ? request.user : "" }],
    temperature: void 0,
    maxTokens: void 0,
    schema: request.schema
  });
  let result;
  try {
    result = await generateText({
      model: request.model,
      output: Output.object({
        schema,
        name: request.schemaName ?? "extract",
        description: request.schemaDescription
      }),
      providerOptions,
      system: request.system,
      messages: request.messages ?? [
        { role: "user", content: request.user }
      ]
    });
  } catch (error) {
    const modelId = typeof request.model === "object" && request.model !== null ? request.model.modelId ?? JSON.stringify(request.model) : String(request.model);
    const failure = toFriendlyProviderError(error, modelId);
    // Close the span before throwing, whether or not the error was translated.
    if (llmSpan && telemetry) {
      const latencyMs = Date.now() - startTime;
      const reported = failure instanceof Error ? failure : new Error(String(failure));
      telemetry.recordEvent(llmSpan, {
        type: "llm_call",
        model: modelId,
        provider: "unknown",
        input: telemetryInput(),
        error: reported,
        latencyMs
      });
      telemetry.endSpan(llmSpan, {
        status: "error",
        error: reported,
        latencyMs
      });
    }
    throw failure;
  }
  // Accept both usage shapes (prompt/completion and input/output) and treat
  // missing or undefined counters as zero so totals never become NaN.
  const usageRaw = result.usage ?? {};
  const inputTokens = usageRaw.promptTokens ?? usageRaw.inputTokens ?? 0;
  const outputTokens = usageRaw.completionTokens ?? usageRaw.outputTokens ?? 0;
  const totalTokens = usageRaw.totalTokens ?? (inputTokens + outputTokens);
  const usage = {
    inputTokens,
    outputTokens,
    totalTokens
  };
  if (llmSpan && telemetry) {
    const latencyMs = Date.now() - startTime;
    telemetry.recordEvent(llmSpan, {
      type: "llm_call",
      model: typeof request.model === "object" && request.model !== null ? request.model.modelId ?? "unknown" : String(request.model),
      provider: preferredProvider ?? "unknown",
      input: telemetryInput(),
      output: {
        content: JSON.stringify(result.output),
        structured: true,
        usage: {
          input: inputTokens,
          output: outputTokens,
          total: totalTokens
        }
      },
      latencyMs
    });
    telemetry.endSpan(llmSpan, {
      status: "ok",
      output: result.output,
      latencyMs
    });
  }
  return { data: result.output, usage };
};
562
+
563
+ // src/llm/RetryingRunner.ts
564
+ var runWithRetries = async (options) => {
565
+ const { telemetry, parentSpan } = options;
566
+ const retrySpan = telemetry?.startSpan({
567
+ name: "struktur.validation_retry",
568
+ kind: "CHAIN",
569
+ parentSpan,
570
+ attributes: {
571
+ "retry.max_attempts": options.maxAttempts ?? 3,
572
+ "retry.schema_name": options.schemaName ?? "extract"
573
+ }
574
+ });
575
+ const ajv = createAjv();
576
+ const maxAttempts = options.maxAttempts ?? 3;
577
+ const messages = [{ role: "user", content: options.user }];
578
+ const debug = options.debug;
579
+ const callId = options.callId ?? `call_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
580
+ let usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
581
+ let lastError;
582
+ const systemLength = options.system.length;
583
+ const userLength = typeof options.user === "string" ? options.user.length : JSON.stringify(options.user).length;
584
+ debug?.llmCallStart({
585
+ callId,
586
+ model: JSON.stringify(options.model),
587
+ schemaName: options.schemaName,
588
+ systemLength,
589
+ userLength,
590
+ artifactCount: Array.isArray(options.user) ? options.user.length : 0
591
+ });
592
+ debug?.promptSystem({ callId, system: options.system });
593
+ debug?.promptUser({ callId, user: options.user });
594
+ for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
595
+ const executor = options.execute ?? generateStructured;
596
+ const isFinalAttempt = attempt === maxAttempts;
597
+ const useStrictValidation = options.strict === true || isFinalAttempt;
598
+ debug?.validationStart({
599
+ callId,
600
+ attempt,
601
+ maxAttempts,
602
+ strict: useStrictValidation
603
+ });
604
+ const startTime = Date.now();
605
+ const result = await executor({
606
+ model: options.model,
607
+ schema: options.schema,
608
+ schemaName: options.schemaName,
609
+ system: options.system,
610
+ user: options.user,
611
+ messages,
612
+ strict: options.strict,
613
+ telemetry,
614
+ parentSpan: retrySpan
615
+ });
616
+ const durationMs = Date.now() - startTime;
617
+ usage = {
618
+ inputTokens: usage.inputTokens + result.usage.inputTokens,
619
+ outputTokens: usage.outputTokens + result.usage.outputTokens,
620
+ totalTokens: usage.totalTokens + result.usage.totalTokens
621
+ };
622
+ debug?.rawResponse({ callId, response: result.data });
623
+ try {
624
+ if (useStrictValidation) {
625
+ const validated = validateOrThrow(
626
+ ajv,
627
+ options.schema,
628
+ result.data
629
+ );
630
+ debug?.validationSuccess({ callId, attempt });
631
+ debug?.llmCallComplete({
632
+ callId,
633
+ success: true,
634
+ inputTokens: usage.inputTokens,
635
+ outputTokens: usage.outputTokens,
636
+ totalTokens: usage.totalTokens,
637
+ durationMs
638
+ });
639
+ if (retrySpan && telemetry) {
640
+ telemetry.recordEvent(retrySpan, {
641
+ type: "validation",
642
+ attempt,
643
+ maxAttempts,
644
+ schema: options.schema,
645
+ input: result.data,
646
+ success: true,
647
+ latencyMs: durationMs
648
+ });
649
+ telemetry.endSpan(retrySpan, {
650
+ status: "ok",
651
+ output: validated,
652
+ latencyMs: durationMs
653
+ });
654
+ }
655
+ return { data: validated, usage };
656
+ } else {
657
+ const validationResult = validateAllowingMissingRequired(
658
+ ajv,
659
+ options.schema,
660
+ result.data,
661
+ isFinalAttempt
662
+ );
663
+ if (validationResult.valid) {
664
+ debug?.validationSuccess({ callId, attempt });
665
+ debug?.llmCallComplete({
666
+ callId,
667
+ success: true,
668
+ inputTokens: usage.inputTokens,
669
+ outputTokens: usage.outputTokens,
670
+ totalTokens: usage.totalTokens,
671
+ durationMs
672
+ });
673
+ if (retrySpan && telemetry) {
674
+ telemetry.recordEvent(retrySpan, {
675
+ type: "validation",
676
+ attempt,
677
+ maxAttempts,
678
+ schema: options.schema,
679
+ input: result.data,
680
+ success: true,
681
+ latencyMs: durationMs
682
+ });
683
+ telemetry.endSpan(retrySpan, {
684
+ status: "ok",
685
+ output: validationResult.data,
686
+ latencyMs: durationMs
687
+ });
688
+ }
689
+ return { data: validationResult.data, usage };
690
+ }
691
+ throw new SchemaValidationError(
692
+ "Schema validation failed",
693
+ validationResult.errors
694
+ );
695
+ }
696
+ } catch (error) {
697
+ lastError = error;
698
+ if (error instanceof SchemaValidationError) {
699
+ debug?.validationFailed({
700
+ callId,
701
+ attempt,
702
+ errors: error.errors
703
+ });
704
+ if (retrySpan && telemetry) {
705
+ telemetry.recordEvent(retrySpan, {
706
+ type: "validation",
707
+ attempt,
708
+ maxAttempts,
709
+ schema: options.schema,
710
+ input: result.data,
711
+ success: false,
712
+ errors: error.errors,
713
+ latencyMs: durationMs
714
+ });
715
+ }
716
+ const nextAttempt = attempt + 1;
717
+ if (nextAttempt <= maxAttempts) {
718
+ await options.events?.onRetry?.({
719
+ attempt: nextAttempt,
720
+ maxAttempts,
721
+ reason: "schema_validation_failed"
722
+ });
723
+ debug?.retry({
724
+ callId,
725
+ attempt: nextAttempt,
726
+ maxAttempts,
727
+ reason: "schema_validation_failed"
728
+ });
729
+ }
730
+ const errorPayload = JSON.stringify(error.errors, null, 2);
731
+ const errorMessage = `<validation-errors>
732
+ ${errorPayload}
733
+ </validation-errors>`;
734
+ messages.push({ role: "user", content: errorMessage });
735
+ await options.events?.onMessage?.({
736
+ role: "user",
737
+ content: errorMessage
738
+ });
739
+ continue;
740
+ }
741
+ debug?.llmCallComplete({
742
+ callId,
743
+ success: false,
744
+ inputTokens: usage.inputTokens,
745
+ outputTokens: usage.outputTokens,
746
+ totalTokens: usage.totalTokens,
747
+ durationMs,
748
+ error: error.message
749
+ });
750
+ if (retrySpan && telemetry) {
751
+ telemetry.endSpan(retrySpan, {
752
+ status: "error",
753
+ error,
754
+ latencyMs: durationMs
755
+ });
756
+ }
757
+ break;
758
+ }
759
+ }
760
+ throw lastError ?? new Error("Unknown extraction error");
761
+ };
762
+
763
+ // src/strategies/utils.ts
764
/**
 * Serialize a schema to compact JSON for embedding in a prompt.
 *
 * @param {unknown} schema
 * @returns {string} `JSON.stringify(schema)`.
 */
var serializeSchema = (schema) => JSON.stringify(schema);
767
/**
 * Sum a list of token usage records.
 *
 * @param {Array<{ inputTokens: number, outputTokens: number, totalTokens: number }>} usages
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 *   Field-wise totals; all zeros for an empty list.
 */
var mergeUsage = (usages) => {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;
  for (const usage of usages) {
    inputTokens += usage.inputTokens;
    outputTokens += usage.outputTokens;
    totalTokens += usage.totalTokens;
  }
  return { inputTokens, outputTokens, totalTokens };
};
777
/**
 * Batch artifacts and, when telemetry is available, record one "chunk" event
 * per batch inside a chunking span.
 *
 * @param {Array<object>} artifacts - Artifacts to batch.
 * @param {object} options - Batching options; `debug` is added before forwarding.
 * @param {object} [debug] - Optional debug logger.
 * @param {object} [telemetry] - Optional telemetry sink.
 * @param {object} [parentSpan] - Parent span for the chunking span.
 * @returns {Array<Array<object>>} The batches produced by `batchArtifacts`.
 */
var getBatches = (artifacts, options, debug, telemetry, parentSpan) => {
  const chunkingSpan = telemetry?.startSpan({
    name: "struktur.chunking",
    kind: "RETRIEVER",
    parentSpan,
    attributes: {
      "chunking.artifact_count": artifacts.length,
      "chunking.max_tokens": options.maxTokens,
      "chunking.max_images": options.maxImages
    }
  });
  const batches = batchArtifacts(artifacts, { ...options, debug });
  if (!chunkingSpan || !telemetry) {
    return batches;
  }
  for (const [chunkIndex, batch] of batches.entries()) {
    let tokens = 0;
    let images = 0;
    for (const artifact of batch) {
      tokens += artifact.tokens || 0;
      images += artifact.contents?.flatMap((content) => content.media || []).length || 0;
    }
    telemetry.recordEvent(chunkingSpan, {
      type: "chunk",
      chunkIndex,
      totalChunks: batches.length,
      tokens,
      images
    });
  }
  telemetry.endSpan(chunkingSpan, {
    status: "ok",
    output: { batchCount: batches.length }
  });
  return batches;
};
806
/**
 * Builds the user message content from the prompt text and artifacts, then
 * delegates the model call to `runWithRetries`.
 *
 * @param options - Call settings: `model`, `schema`, `system`, `user`,
 *   `artifacts`, `events`, `execute`, `strict`, `debug`, `callId`,
 *   `telemetry` and `parentSpan`.
 * @returns The value resolved by `runWithRetries`.
 */
var extractWithPrompt = async (options) => {
  const userContent = buildUserContent(options.user, options.artifacts);
  const {
    model,
    schema,
    system,
    events,
    execute,
    strict,
    debug,
    callId,
    telemetry,
    parentSpan
  } = options;
  return await runWithRetries({
    model,
    schema,
    system,
    user: userContent,
    events,
    execute,
    strict,
    debug,
    callId,
    telemetry,
    parentSpan
  });
};
823
+
824
+ // src/strategies/SimpleStrategy.ts
825
/**
 * Strategy that sends every artifact to the model in a single extraction
 * call.
 */
var SimpleStrategy = class {
  name = "simple";
  config;
  /**
   * @param config - Strategy settings; `model`, `execute`, `strict` and
   *   `outputInstructions` are read by `run`.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @returns {number} The fixed step estimate reported in progress events.
   */
  getEstimatedSteps() {
    return 3;
  }
  /**
   * Runs the single extraction call.
   *
   * @param options - Run settings: `artifacts`, `schema`, and optional
   *   `events`, `debug`, `telemetry` and `strict`.
   * @returns {Promise<{data: unknown, usage: unknown}>} Extracted data and
   *   the token usage of the call.
   * @throws Re-throws any error raised while building the prompt or
   *   extracting, after ending the strategy span with status "error".
   */
  async run(options) {
    const debug = options.debug;
    const telemetry = options.telemetry ?? void 0;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.simple",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length
      }
    });
    let result;
    try {
      const schema = serializeSchema(options.schema);
      const { system, user } = buildExtractorPrompt(
        options.artifacts,
        schema,
        this.config.outputInstructions
      );
      await options.events?.onStep?.({
        step: 1,
        total: this.getEstimatedSteps(),
        label: "extract"
      });
      debug?.step({
        step: 1,
        total: this.getEstimatedSteps(),
        label: "extract",
        strategy: this.name
      });
      result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system,
        user,
        artifacts: options.artifacts,
        events: options.events,
        execute: this.config.execute,
        // A per-run `strict` overrides the strategy-level default.
        strict: options.strict ?? this.config.strict,
        debug,
        callId: "simple_extract",
        telemetry,
        parentSpan: strategySpan
      });
      debug?.step({
        step: 2,
        total: this.getEstimatedSteps(),
        label: "complete",
        strategy: this.name
      });
    } catch (error) {
      // The span is local to this method, so nobody else can close it:
      // end it here with the failure before propagating the error.
      if (strategySpan && telemetry) {
        telemetry.endSpan(strategySpan, { status: "error", error });
      }
      throw error;
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: result.data
    });
    return { data: result.data, usage: result.usage };
  }
};
889
/**
 * Factory for the single-call strategy.
 *
 * @param config - Settings handed to the `SimpleStrategy` constructor.
 * @returns A new `SimpleStrategy` instance.
 */
var simple = (config) => new SimpleStrategy(config);
892
+
893
+ // src/prompts/ParallelMergerPrompt.ts
894
/**
 * Builds the system and user prompts that ask a model to merge several
 * extracted JSON objects into one object matching the schema.
 *
 * @param {string} schema - Serialised JSON schema placed in the user prompt.
 * @param {Array<unknown>} dataList - Objects to merge; `null` and
 *   `undefined` entries are left out.
 * @returns {{system: string, user: string}} The two prompt strings.
 */
var buildParallelMergerPrompt = (schema, dataList) => {
  // One <json-object> element per non-nullish input, newline separated.
  const wrappedObjects = [];
  dataList.forEach((entry) => {
    if (entry === null || entry === void 0) {
      return;
    }
    wrappedObjects.push(`<json-object>${JSON.stringify(entry)}</json-object>`);
  });
  const jsonObjects = wrappedObjects.join("\n");

  const system = [
    "You are a data merger. Combine multiple JSON objects into one object matching the provided schema.",
    "",
    "<thinking>",
    "Before merging, consider:",
    "1. Which input objects contain data for each schema field?",
    "2. How should conflicting values be resolved (prefer more complete/recent data)?",
    "3. Are there arrays that need to be concatenated vs deduplicated?",
    "4. Ensure NO information is lost from any input",
    "</thinking>",
    "",
    "<rules>",
    "- Produce a single JSON object following the schema exactly",
    "- Combine all information from input objects without losing data",
    "- Resolve conflicts intelligently (prefer richer/more specific data)",
    "- Output ONLY valid JSON - no markdown, no explanations",
    "</rules>"
  ].join("\n");

  const user = [
    "<json-schema>",
    `${schema}`,
    "</json-schema>",
    "",
    "<json-objects>",
    `${jsonObjects}`,
    "</json-objects>"
  ].join("\n");

  return { system, user };
};
921
+
922
+ // src/strategies/concurrency.ts
923
/**
 * Runs task thunks in consecutive groups of at most `concurrency` tasks.
 * Every task of a group is started together and the whole group must finish
 * (`Promise.all`) before the next group starts.
 *
 * @param {Array<() => Promise<unknown>>} tasks - Thunks that start the work
 *   when called.
 * @param {number} concurrency - Maximum tasks per group. Fractions are
 *   rounded down; a value that is not a number of at least 1 (0, negative,
 *   NaN, undefined) falls back to 1 so the tasks run one at a time.
 *   `Infinity` runs everything in one group.
 * @returns {Promise<Array<unknown>>} Results in the same order as `tasks`.
 * @throws Rejects with the first rejection of any task, as `Promise.all`
 *   does.
 */
var runConcurrently = async (tasks, concurrency) => {
  // A step of 0 would never advance the loop and NaN would end it after an
  // empty first group, so clamp the group size to a positive integer.
  let groupSize = Math.floor(Number(concurrency));
  if (Number.isNaN(groupSize) || groupSize < 1) {
    groupSize = 1;
  }
  const results = [];
  for (let start = 0; start < tasks.length; start += groupSize) {
    const group = tasks.slice(start, start + groupSize).map((task) => task());
    const groupResults = await Promise.all(group);
    results.push(...groupResults);
  }
  return results;
};
932
+
933
+ // src/strategies/ParallelStrategy.ts
934
/**
 * Strategy that extracts each batch of artifacts independently (through
 * `runConcurrently`) and then asks the merge model to combine the per-batch
 * results into one object.
 */
var ParallelStrategy = class {
  name = "parallel";
  config;
  /**
   * @param config - Strategy settings; `model`, `mergeModel`, `execute`,
   *   `strict`, `chunkSize`, `maxImages`, `concurrency` and
   *   `outputInstructions` are read by this class.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @param artifacts - Artifacts that would be processed.
   * @returns {number} Number of batches plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  /**
   * Extracts all batches, then merges the results with one more model call.
   *
   * @param options - Run settings: `artifacts`, `schema`, and optional
   *   `events`, `debug`, `telemetry` and `strict`.
   * @returns {Promise<{data: unknown, usage: object}>} Merged data plus the
   *   summed usage of every batch call and the merge call.
   * @throws Re-throws any failure after ending the still-open merge and
   *   strategy spans with status "error".
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    // A per-run `strict` overrides the strategy default for every model call
    // of this run, including the merge call.
    const strict = options.strict ?? this.config.strict;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    let mergeSpan;
    let mergeSpanOpen = false;
    let results;
    let merged;
    try {
      const batches = getBatches(
        options.artifacts,
        {
          maxTokens: this.config.chunkSize,
          maxImages: this.config.maxImages
        },
        debug,
        telemetry ?? void 0,
        strategySpan
      );
      const schema = serializeSchema(options.schema);
      const totalSteps = this.getEstimatedSteps(options.artifacts);
      let step = 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract"
      });
      debug?.step({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
        strategy: this.name
      });
      const tasks = batches.map((batch, index) => async () => {
        const prompt = buildExtractorPrompt(
          batch,
          schema,
          this.config.outputInstructions
        );
        const result = await extractWithPrompt({
          model: this.config.model,
          schema: options.schema,
          system: prompt.system,
          user: prompt.user,
          artifacts: batch,
          events: options.events,
          execute: this.config.execute,
          strict,
          debug,
          callId: `parallel_batch_${index + 1}`,
          telemetry: telemetry ?? void 0,
          parentSpan: strategySpan
        });
        const completedIndex = index + 1;
        // Announce the next batch unless this was the last one.
        if (completedIndex < batches.length) {
          step += 1;
          await options.events?.onStep?.({
            step,
            total: totalSteps,
            label: `batch ${completedIndex + 1}/${batches.length}`
          });
          debug?.step({
            step,
            total: totalSteps,
            label: `batch ${completedIndex + 1}/${batches.length}`,
            strategy: this.name
          });
        }
        return result;
      });
      results = await runConcurrently(
        tasks,
        this.config.concurrency ?? batches.length
      );
      debug?.mergeStart({
        mergeId: "parallel_merge",
        inputCount: results.length,
        strategy: this.name
      });
      mergeSpan = telemetry?.startSpan({
        name: "struktur.merge",
        kind: "CHAIN",
        parentSpan: strategySpan,
        attributes: {
          "merge.strategy": "parallel",
          "merge.input_count": results.length
        }
      });
      mergeSpanOpen = Boolean(mergeSpan);
      const mergePrompt = buildParallelMergerPrompt(
        schema,
        results.map((r) => r.data)
      );
      merged = await extractWithPrompt({
        model: this.config.mergeModel,
        schema: options.schema,
        system: mergePrompt.system,
        user: mergePrompt.user,
        artifacts: [],
        events: options.events,
        execute: this.config.execute,
        strict,
        debug,
        callId: "parallel_merge",
        telemetry: telemetry ?? void 0,
        parentSpan: mergeSpan
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: "merge"
      });
      debug?.step({
        step,
        total: totalSteps,
        label: "merge",
        strategy: this.name
      });
      debug?.mergeComplete({ mergeId: "parallel_merge", success: true });
      if (mergeSpan && telemetry) {
        telemetry.recordEvent(mergeSpan, {
          type: "merge",
          strategy: "parallel",
          inputCount: results.length,
          outputCount: 1
        });
        // Mark closed first so a throwing adapter is not asked to end the
        // same span a second time from the catch block.
        mergeSpanOpen = false;
        telemetry.endSpan(mergeSpan, {
          status: "ok",
          output: merged.data
        });
      }
    } catch (error) {
      // Both spans are local to this method; close whatever is still open
      // with the failure before propagating it.
      if (telemetry) {
        if (mergeSpanOpen) {
          telemetry.endSpan(mergeSpan, { status: "error", error });
        }
        if (strategySpan) {
          telemetry.endSpan(strategySpan, { status: "error", error });
        }
      }
      throw error;
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: merged.data
    });
    return {
      data: merged.data,
      usage: mergeUsage([...results.map((r) => r.usage), merged.usage])
    };
  }
};
1092
/**
 * Factory for the parallel extract-then-merge strategy.
 *
 * @param config - Settings handed to the `ParallelStrategy` constructor.
 * @returns A new `ParallelStrategy` instance.
 */
var parallel = (config) => new ParallelStrategy(config);
1095
+
1096
+ // src/prompts/SequentialExtractorPrompt.ts
1097
/**
 * Builds the system prompt for sequential extraction, where each call
 * enriches the data returned by the previous one.
 *
 * @param {string} schema - Serialised JSON schema embedded in the prompt.
 * @param {string} [outputInstructions] - Extra output instructions; a
 *   placeholder sentence is used when this is null or undefined.
 * @returns {string} The complete system prompt.
 */
var sequentialSystemPrompt = (schema, outputInstructions) => {
  const promptLines = [
    "<instructions>",
    "You are a precise data extraction engine. Extract data from provided artifacts according to the JSON schema, enriching any previous data you receive.",
    "",
    "<thinking>",
    "Before extracting, consider:",
    "1. Review previous data - what needs to be preserved vs enriched?",
    "2. Which new fields have clear values in the artifacts?",
    "3. Which fields remain missing or unclear (keep null from previous or set to null)?",
    "4. Can new information improve the structure of existing data?",
    "5. Ensure NO information is lost from previous data",
    "</thinking>",
    "",
    "<rules>",
    "- Merge new artifacts into existing data - do not create fresh objects",
    "- Preserve ALL previous data - losing information breaks the processing chain",
    "- Use null for missing/uncertain values in new fields",
    "- Only extract information explicitly present in the artifacts",
    "- Output ONLY valid JSON matching the schema",
    "- No markdown, explanations, or code fences",
    "</rules>",
    "",
    "<image-handling>",
    "Some schema properties may reference artifact IDs (e.g., 'xxx_artifact_id' fields).",
    "When assigning images to properties:",
    "- Use format: artifact:ID/images/imageNUM.EXT (e.g., 'artifact:123456/images/image1.jpg')",
    "- Only reference images you can actually see in the provided documents/images",
    "- Image references are visible in artifact XML or written on images",
    "- NEVER make up artifact IDs or use normal URLs",
    "</image-handling>",
    "",
    "<output-instructions>",
    `${outputInstructions ?? "No additional output instructions provided."}`,
    "</output-instructions>",
    "",
    "<json-schema>",
    `${schema}`,
    "</json-schema>",
    "",
    "<how-to-output>",
    "Return the complete extracted data as valid JSON matching the schema.",
    "Include all information from previous data, enriched with the new artifacts.",
    "</how-to-output>",
    "</instructions>"
  ];
  return promptLines.join("\n");
};
1142
/**
 * Builds the user prompt for one sequential extraction step: the artifacts,
 * the data extracted so far, and the instruction to merge without loss.
 *
 * @param {string} artifactsXml - Artifacts already rendered as XML text.
 * @param {string} previousData - Serialised data from earlier steps.
 * @param {string} [outputInstructions] - Extra output instructions; an
 *   empty string is used when this is null or undefined.
 * @returns {string} The complete user prompt.
 */
var sequentialUserPrompt = (artifactsXml, previousData, outputInstructions) => {
  const promptLines = [
    `${artifactsXml}`,
    "",
    "<previous-data>",
    `${previousData}`,
    "</previous-data>",
    "",
    "<task>",
    "Extract the contents of the given artifacts and ADD/MERGE them into the previous data contained in the <previous-data> tag.",
    "You MUST NOT lose any information from the previous data. All previous data must be included in your response.",
    "</task>",
    "",
    "<output-instructions>",
    `${outputInstructions ?? ""}`,
    "</output-instructions>"
  ];
  return promptLines.join("\n");
};
1158
/**
 * Assembles the system and user prompts for one sequential extraction step.
 *
 * @param artifacts - Artifacts of the current batch; rendered with
 *   `formatArtifactsXml`.
 * @param {string} schema - Serialised JSON schema.
 * @param {string} previousData - Serialised data from earlier steps.
 * @param {string} [outputInstructions] - Extra output instructions, passed
 *   to both prompts.
 * @returns {{system: string, user: string}} The two prompt strings.
 */
var buildSequentialPrompt = (artifacts, schema, previousData, outputInstructions) => {
  const renderedArtifacts = formatArtifactsXml(artifacts);
  const system = sequentialSystemPrompt(schema, outputInstructions);
  const user = sequentialUserPrompt(renderedArtifacts, previousData, outputInstructions);
  return { system, user };
};
1165
+
1166
+ // src/strategies/SequentialStrategy.ts
1167
/**
 * Strategy that processes batches one after another, handing each call the
 * data returned by the previous call so the model can enrich it.
 */
var SequentialStrategy = class {
  name = "sequential";
  config;
  /**
   * @param config - Strategy settings; `model`, `execute`, `strict`,
   *   `chunkSize`, `maxImages` and `outputInstructions` are read by this
   *   class.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @param artifacts - Artifacts that would be processed.
   * @returns {number} Number of batches plus two.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 2;
  }
  /**
   * Extracts batch by batch, carrying the accumulated data forward.
   *
   * @param options - Run settings: `artifacts`, `schema`, and optional
   *   `events`, `debug`, `telemetry` and `strict`.
   * @returns {Promise<{data: unknown, usage: object}>} Data returned by the
   *   last batch call plus the summed usage of all calls.
   * @throws {Error} "No data extracted from sequential strategy" when no
   *   call produced data (for example when there are no batches). Any
   *   failure ends the strategy span with status "error" before it is
   *   re-thrown.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.sequential",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize
      }
    });
    let currentData;
    const usages = [];
    try {
      const batches = getBatches(
        options.artifacts,
        {
          maxTokens: this.config.chunkSize,
          maxImages: this.config.maxImages
        },
        debug,
        telemetry ?? void 0,
        strategySpan
      );
      const schema = serializeSchema(options.schema);
      const totalSteps = this.getEstimatedSteps(options.artifacts);
      let step = 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract"
      });
      debug?.step({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
        strategy: this.name
      });
      for (const [index, batch] of batches.entries()) {
        // The first call has nothing to build on, so it is given "{}".
        const previousData = currentData ? JSON.stringify(currentData) : "{}";
        const prompt = buildSequentialPrompt(
          batch,
          schema,
          previousData,
          this.config.outputInstructions
        );
        const result = await extractWithPrompt({
          model: this.config.model,
          schema: options.schema,
          system: prompt.system,
          user: prompt.user,
          artifacts: batch,
          events: options.events,
          execute: this.config.execute,
          // A per-run `strict` overrides the strategy-level default.
          strict: options.strict ?? this.config.strict,
          debug,
          callId: `sequential_batch_${index + 1}`,
          telemetry: telemetry ?? void 0,
          parentSpan: strategySpan
        });
        currentData = result.data;
        usages.push(result.usage);
        step += 1;
        // Announce the next batch unless this was the last one.
        if (index < batches.length - 1) {
          await options.events?.onStep?.({
            step,
            total: totalSteps,
            label: `batch ${index + 2}/${batches.length}`
          });
          debug?.step({
            step,
            total: totalSteps,
            label: `batch ${index + 2}/${batches.length}`,
            strategy: this.name
          });
        }
      }
      if (!currentData) {
        throw new Error("No data extracted from sequential strategy");
      }
    } catch (error) {
      // The span is local to this method, so nobody else can close it:
      // end it here with the failure before propagating the error.
      if (strategySpan && telemetry) {
        telemetry.endSpan(strategySpan, { status: "error", error });
      }
      throw error;
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: currentData
    });
    return { data: currentData, usage: mergeUsage(usages) };
  }
};
1267
/**
 * Factory for the batch-by-batch enrichment strategy.
 *
 * @param config - Settings handed to the `SequentialStrategy` constructor.
 * @returns A new `SequentialStrategy` instance.
 */
var sequential = (config) => new SequentialStrategy(config);
1270
+
1271
+ // src/prompts/DeduplicationPrompt.ts
1272
/**
 * Builds the system and user prompts that ask a model to list duplicate
 * entries in already-extracted data.
 *
 * @param {string} schema - Serialised JSON schema placed in the user prompt.
 * @param {unknown} data - Data to inspect; embedded via `JSON.stringify`.
 * @param {string[]} [exampleKeys] - Accepted for interface compatibility;
 *   the prompt text does not read it (the example paths are fixed text).
 * @returns {{system: string, user: string}} The two prompt strings.
 */
var buildDeduplicationPrompt = (schema, data, exampleKeys = ["items.3", "items.5"]) => {
  const system = [
    "You are a deduplication engine. Identify duplicate entries in structured data.",
    "",
    "<thinking>",
    "Before deduplicating, consider:",
    "1. Which fields indicate uniqueness for each entity type?",
    "2. Are entries duplicates if they share key fields but differ in minor details?",
    "3. Which entry should be kept (prefer more complete data)?",
    "</thinking>",
    "",
    "<rules>",
    "- Identify entries that represent the same entity",
    '- Return paths to duplicates using dot notation (e.g., "items.3", "items.5")',
    '- Output ONLY JSON in format: { "keys": ["path1", "path2"] }',
    "- No markdown, no explanations",
    "</rules>"
  ].join("\n");

  const user = [
    "<json-schema>",
    `${schema}`,
    "</json-schema>",
    "",
    "<json-data>",
    `${JSON.stringify(data)}`,
    "</json-data>",
    "",
    '<task>Identify duplicate entries in the data and return their paths in the format: { "keys": ["path1", "path2"] }</task>',
    "",
    "<example>",
    'If items at indices 3 and 5 are duplicates, return: { "keys": ["items.3", "items.5"] }',
    "</example>"
  ].join("\n");

  return { system, user };
};
1303
+
1304
+ // src/merge/SmartDataMerger.ts
1305
/**
 * Tells whether a schema node declares `type: "array"`.
 *
 * @param {{type?: unknown}} schema - Schema node to inspect.
 * @returns {boolean} True only when `schema.type` is exactly "array".
 */
var isArraySchema = (schema) => schema.type === "array";
1311
/**
 * Tells whether a schema node declares `type: "object"` and carries a
 * `properties` value whose `typeof` is "object".
 *
 * @param {{type?: unknown, properties?: unknown}} schema - Schema node.
 * @returns {boolean} True when both conditions hold.
 */
var isObjectSchema = (schema) => {
  if (schema.type !== "object") {
    return false;
  }
  return typeof schema.properties === "object";
};
1314
/**
 * Merges two extracted records field by field, guided by the top-level
 * `properties` of a JSON schema.
 */
var SmartDataMerger = class {
  schema;
  /**
   * @param schema - JSON schema whose `properties` drive the merge.
   */
  constructor(schema) {
    this.schema = schema;
  }
  /**
   * Combines `newData` into a copy of `currentData`.
   *
   * For each schema property: array-typed fields are concatenated (current
   * first), object-typed fields are shallow-merged (new keys win), and other
   * fields take the new value unless it is undefined, null or "", in which
   * case a defined current value is kept. Fields not listed in the schema
   * are carried over from `currentData` only.
   *
   * @param {object} currentData - Data accumulated so far; not mutated.
   * @param {object} newData - Newly extracted data.
   * @returns {object} The merged record.
   */
  merge(currentData, newData) {
    const result = { ...currentData };
    const fieldSchemas = Object.entries(this.schema.properties ?? {});
    for (const [field, fieldSchema] of fieldSchemas) {
      const existing = currentData[field];
      const incoming = newData[field];

      if (isArraySchema(fieldSchema)) {
        const left = Array.isArray(existing) ? existing : [];
        const right = Array.isArray(incoming) ? incoming : [];
        result[field] = [...left, ...right];
        continue;
      }

      if (isObjectSchema(fieldSchema)) {
        const base = typeof existing === "object" && existing ? existing : {};
        const overlay = typeof incoming === "object" && incoming ? incoming : {};
        result[field] = { ...base, ...overlay };
        continue;
      }

      const incomingIsUsable = incoming !== void 0 && incoming !== null && incoming !== "";
      if (incomingIsUsable) {
        result[field] = incoming;
      } else if (existing !== void 0) {
        result[field] = existing;
      }
    }
    return result;
  }
};
1348
+
1349
+ // src/merge/Deduplicator.ts
1350
/**
 * Computes a 32-bit FNV-1a style hash over the UTF-16 code units of a
 * string.
 *
 * @param {string} str - Text to hash.
 * @returns {number} Unsigned 32-bit hash value.
 */
var fnv1a32 = (str) => {
  const OFFSET_BASIS = 2166136261;
  const PRIME = 16777619;
  let state = OFFSET_BASIS;
  let position = 0;
  while (position < str.length) {
    // XOR the code unit in, then multiply with 32-bit wrap-around.
    state = Math.imul(state ^ str.charCodeAt(position), PRIME);
    position += 1;
  }
  // Reinterpret the signed 32-bit state as unsigned.
  return state >>> 0;
};
1358
/**
 * Serialises a value to JSON-like text with object keys sorted, so that two
 * objects with the same content but different key order yield the same
 * string.
 *
 * Keys are written with `JSON.stringify` so quotes and backslashes inside a
 * key are escaped; without that, an object with the single key `a":1,"b`
 * would serialise exactly like `{a: 1, b: ...}` and be mistaken for it.
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} Deterministic text form of `value`.
 */
var stableStringify = (value) => {
  // Primitives and null use the standard JSON form.
  if (value === null || typeof value !== "object") {
    return JSON.stringify(value);
  }
  if (Array.isArray(value)) {
    return `[${value.map((item) => stableStringify(item)).join(",")}]`;
  }
  const entries = Object.entries(value)
    .sort(([a], [b]) => a.localeCompare(b))
    .map(([key, val]) => `${JSON.stringify(key)}:${stableStringify(val)}`);
  return `{${entries.join(",")}}`;
};
1368
/**
 * Finds the positions of items that exactly repeat an earlier item.
 *
 * Items are compared by their `stableStringify` text. The `fnv1a32` hash is
 * used only to bucket candidates: two items count as duplicates only when
 * their serialised text matches, so a 32-bit hash collision between
 * different items can no longer mark one of them as a duplicate.
 *
 * @param {Array<unknown>} items - Items to scan.
 * @returns {number[]} Indices (ascending) of every item equal to an earlier
 *   one; the first occurrence is never listed.
 */
var findExactDuplicatesWithHashing = (items) => {
  // hash -> serialised forms already seen with that hash
  const buckets = /* @__PURE__ */ new Map();
  const duplicates = [];
  items.forEach((item, index) => {
    const serialized = stableStringify(item);
    const hash = fnv1a32(serialized);
    const bucket = buckets.get(hash);
    if (bucket === void 0) {
      buckets.set(hash, [serialized]);
      return;
    }
    if (bucket.includes(serialized)) {
      duplicates.push(index);
      return;
    }
    // Same hash but different content: remember it, keep the item.
    bucket.push(serialized);
  });
  return duplicates;
};
1381
/**
 * Returns a copy of `items` without the elements at the given positions.
 *
 * @param {Array<unknown>} items - Source list; not mutated.
 * @param {Iterable<number>} indices - Positions to leave out.
 * @returns {Array<unknown>} The remaining items in their original order.
 */
var deduplicateByIndices = (items, indices) => {
  const dropped = new Set(indices);
  const isKept = (_item, position) => dropped.has(position) === false;
  return items.filter(isKept);
};
1385
+
1386
+ // src/strategies/ParallelAutoMergeStrategy.ts
1387
// JSON schema for the deduplication reply: an object whose single required
// property `keys` is an array of strings; no other properties are allowed.
var dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" }
    }
  },
  required: ["keys"],
  additionalProperties: false
};
1395
/**
 * Removes exact duplicate entries from every top-level array field of a
 * record. Non-array fields are copied unchanged.
 *
 * @param {object} data - Record to clean; not mutated.
 * @returns {object} Shallow copy of `data` with de-duplicated arrays.
 */
var dedupeArrays = (data) => {
  const cleaned = { ...data };
  for (const field of Object.keys(cleaned)) {
    const candidate = cleaned[field];
    if (!Array.isArray(candidate)) {
      continue;
    }
    const repeatedPositions = findExactDuplicatesWithHashing(candidate);
    cleaned[field] = deduplicateByIndices(candidate, repeatedPositions);
  }
  return cleaned;
};
1405
/**
 * Removes one element from a top-level array field, addressed by a
 * dot-notation path such as "items.3".
 *
 * Only the first two path segments are used (field name and index). The
 * path is ignored, and `data` returned unchanged, when the field name is
 * empty, the index segment is missing or blank, the index is not a
 * non-negative integer, the index is past the end of the array, or the
 * field is not an array. A blank index would otherwise convert to 0 and a
 * negative index would splice from the end, each deleting an entry the path
 * never named.
 *
 * @param {object} data - Record to update; not mutated.
 * @param {string} path - "<field>.<index>" path of the element to drop.
 * @returns {object} A copy of `data` with the element removed, or `data`
 *   itself when the path does not address an existing array element.
 */
var removeByPath = (data, path) => {
  const [root, indexStr] = path.split(".");
  if (!root || indexStr === void 0 || indexStr.trim() === "") {
    return data;
  }
  const index = Number(indexStr);
  if (!Number.isInteger(index) || index < 0) {
    return data;
  }
  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }
  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
1419
/**
 * Strategy that extracts batches independently, merges the results in code
 * with `SmartDataMerger`, removes exact duplicates by hashing, and finally
 * asks a model to point out remaining duplicates.
 */
var ParallelAutoMergeStrategy = class {
  name = "parallel-auto-merge";
  config;
  /**
   * @param config - Strategy settings; `model`, `dedupeModel`, `execute`,
   *   `dedupeExecute`, `strict`, `chunkSize`, `maxImages`, `concurrency` and
   *   `outputInstructions` are read by this class.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @param artifacts - Artifacts that would be processed.
   * @returns {number} Number of batches plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  /**
   * Runs extraction, code-level merge, exact de-duplication and the
   * model-assisted de-duplication.
   *
   * @param options - Run settings: `artifacts`, `schema`, and optional
   *   `events`, `debug`, `telemetry` and `strict`.
   * @returns {Promise<{data: object, usage: object}>} De-duplicated merged
   *   data plus the summed usage of the batch calls and the dedupe call.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        // A per-run `strict` overrides the strategy-level default.
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `parallel_auto_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: strategySpan
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
      return result;
    });
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    const merger = new SmartDataMerger(
      options.schema
    );
    let merged = {};
    debug?.mergeStart({
      mergeId: "parallel_auto_smart_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length
      }
    });
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      merged = merger.merge(merged, result.data);
      // Report, per field of this batch result, the array sizes after
      // (left) and contributed by (right) the merge.
      for (const key of Object.keys(result.data)) {
        const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
        const rightArray = Array.isArray(
          result.data[key]
        ) ? result.data[key].length : void 0;
        debug?.smartMergeField({
          mergeId: "parallel_auto_smart_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray
        });
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1
          });
        }
      }
    }
    debug?.mergeComplete({
      mergeId: "parallel_auto_smart_merge",
      success: true
    });
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged
      });
    }
    merged = dedupeArrays(merged);
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing"
      }
    });
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged
      });
    }
    const dedupePrompt = buildDeduplicationPrompt(schema, merged);
    debug?.dedupeStart({
      dedupeId: "parallel_auto_dedupe",
      itemCount: Object.keys(merged).length
    });
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm"
      }
    });
    const dedupeResponse = await runWithRetries({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "parallel_auto_dedupe",
      telemetry: telemetry ?? void 0,
      parentSpan: llmDedupeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name
    });
    // Every reported path refers to a position in `merged` as the model saw
    // it. Removing entries in reported order would shift the later
    // positions (after dropping "items.3", the old "items.5" sits at 4), so
    // apply each distinct path once, highest index first. Paths without a
    // numeric index sort last; `removeByPath` ignores them.
    const indexOfPath = (path) => {
      const index = Number(path.split(".")[1]);
      return Number.isNaN(index) ? -1 : index;
    };
    const removalOrder = [...new Set(dedupeResponse.data.keys)].sort(
      (a, b) => indexOfPath(b) - indexOfPath(a)
    );
    let deduped = merged;
    for (const key of removalOrder) {
      deduped = removeByPath(deduped, key);
    }
    debug?.dedupeComplete({
      dedupeId: "parallel_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length
    });
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped
      });
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: deduped
    });
    return {
      data: deduped,
      usage: mergeUsage([...results.map((r) => r.usage), dedupeResponse.usage])
    };
  }
};
1642
/**
 * Factory for the parallel strategy with code-level merge and
 * de-duplication.
 *
 * @param config - Settings handed to the `ParallelAutoMergeStrategy`
 *   constructor.
 * @returns A new `ParallelAutoMergeStrategy` instance.
 */
var parallelAutoMerge = (config) => new ParallelAutoMergeStrategy(config);
1645
+
1646
+ // src/strategies/SequentialAutoMergeStrategy.ts
1647
// JSON schema for the deduplication reply: an object whose single required
// property `keys` is an array of strings; no other properties are allowed.
var dedupeSchema2 = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" }
    }
  },
  required: ["keys"],
  additionalProperties: false
};
1655
/**
 * Removes exact duplicate entries from every top-level array field of a
 * record. Non-array fields are copied unchanged.
 *
 * @param {object} data - Record to clean; not mutated.
 * @returns {object} Shallow copy of `data` with de-duplicated arrays.
 */
var dedupeArrays2 = (data) => {
  const cleaned = { ...data };
  for (const field of Object.keys(cleaned)) {
    const candidate = cleaned[field];
    if (!Array.isArray(candidate)) {
      continue;
    }
    const repeatedPositions = findExactDuplicatesWithHashing(candidate);
    cleaned[field] = deduplicateByIndices(candidate, repeatedPositions);
  }
  return cleaned;
};
1665
/**
 * Removes one element from a top-level array field, addressed by a
 * dot-notation path such as "items.3".
 *
 * Only the first two path segments are used (field name and index). The
 * path is ignored, and `data` returned unchanged, when the field name is
 * empty, the index segment is missing or blank, the index is not a
 * non-negative integer, the index is past the end of the array, or the
 * field is not an array. A blank index would otherwise convert to 0 and a
 * negative index would splice from the end, each deleting an entry the path
 * never named.
 *
 * @param {object} data - Record to update; not mutated.
 * @param {string} path - "<field>.<index>" path of the element to drop.
 * @returns {object} A copy of `data` with the element removed, or `data`
 *   itself when the path does not address an existing array element.
 */
var removeByPath2 = (data, path) => {
  const [root, indexStr] = path.split(".");
  if (!root || indexStr === void 0 || indexStr.trim() === "") {
    return data;
  }
  const index = Number(indexStr);
  if (!Number.isInteger(index) || index < 0) {
    return data;
  }
  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }
  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
1679
+ var SequentialAutoMergeStrategy = class {
1680
+ name = "sequential-auto-merge";
1681
+ config;
1682
+ constructor(config) {
1683
+ this.config = config;
1684
+ }
1685
+ getEstimatedSteps(artifacts) {
1686
+ const batches = getBatches(artifacts, {
1687
+ maxTokens: this.config.chunkSize,
1688
+ maxImages: this.config.maxImages
1689
+ });
1690
+ return batches.length + 3;
1691
+ }
1692
+ async run(options) {
1693
+ const debug = options.debug;
1694
+ const { telemetry } = options;
1695
+ const strategySpan = telemetry?.startSpan({
1696
+ name: "strategy.sequential-auto-merge",
1697
+ kind: "CHAIN",
1698
+ attributes: {
1699
+ "strategy.name": this.name,
1700
+ "strategy.artifacts.count": options.artifacts.length,
1701
+ "strategy.chunk_size": this.config.chunkSize
1702
+ }
1703
+ });
1704
+ const batches = getBatches(
1705
+ options.artifacts,
1706
+ {
1707
+ maxTokens: this.config.chunkSize,
1708
+ maxImages: this.config.maxImages
1709
+ },
1710
+ debug,
1711
+ telemetry ?? void 0,
1712
+ strategySpan
1713
+ );
1714
+ const schema = serializeSchema(options.schema);
1715
+ const merger = new SmartDataMerger(
1716
+ options.schema
1717
+ );
1718
+ let merged = {};
1719
+ const usages = [];
1720
+ const totalSteps = this.getEstimatedSteps(options.artifacts);
1721
+ let step = 1;
1722
+ debug?.mergeStart({
1723
+ mergeId: "sequential_auto_merge",
1724
+ inputCount: batches.length,
1725
+ strategy: this.name
1726
+ });
1727
+ const mergeSpan = telemetry?.startSpan({
1728
+ name: "struktur.smart_merge",
1729
+ kind: "CHAIN",
1730
+ parentSpan: strategySpan,
1731
+ attributes: {
1732
+ "merge.strategy": "smart",
1733
+ "merge.input_count": batches.length
1734
+ }
1735
+ });
1736
+ for (const [index, batch] of batches.entries()) {
1737
+ const prompt = buildExtractorPrompt(
1738
+ batch,
1739
+ schema,
1740
+ this.config.outputInstructions
1741
+ );
1742
+ const result = await extractWithPrompt({
1743
+ model: this.config.model,
1744
+ schema: options.schema,
1745
+ system: prompt.system,
1746
+ user: prompt.user,
1747
+ artifacts: batch,
1748
+ events: options.events,
1749
+ execute: this.config.execute,
1750
+ strict: options.strict ?? this.config.strict,
1751
+ debug,
1752
+ callId: `sequential_auto_batch_${index + 1}`,
1753
+ telemetry: telemetry ?? void 0,
1754
+ parentSpan: mergeSpan
1755
+ });
1756
+ merged = merger.merge(merged, result.data);
1757
+ usages.push(result.usage);
1758
+ for (const key of Object.keys(result.data)) {
1759
+ const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
1760
+ const rightArray = Array.isArray(
1761
+ result.data[key]
1762
+ ) ? result.data[key].length : void 0;
1763
+ debug?.smartMergeField({
1764
+ mergeId: "sequential_auto_merge",
1765
+ field: key,
1766
+ operation: "merge_arrays",
1767
+ leftCount: leftArray,
1768
+ rightCount: rightArray
1769
+ });
1770
+ if (mergeSpan && telemetry) {
1771
+ telemetry.recordEvent(mergeSpan, {
1772
+ type: "merge",
1773
+ strategy: "smart",
1774
+ inputCount: rightArray ?? 1,
1775
+ outputCount: leftArray ?? 1
1776
+ });
1777
+ }
1778
+ }
1779
+ step += 1;
1780
+ await options.events?.onStep?.({
1781
+ step,
1782
+ total: totalSteps,
1783
+ label: `batch ${index + 1}/${batches.length}`
1784
+ });
1785
+ debug?.step({
1786
+ step,
1787
+ total: totalSteps,
1788
+ label: `batch ${index + 1}/${batches.length}`,
1789
+ strategy: this.name
1790
+ });
1791
+ }
1792
+ debug?.mergeComplete({ mergeId: "sequential_auto_merge", success: true });
1793
+ if (mergeSpan && telemetry) {
1794
+ telemetry.endSpan(mergeSpan, {
1795
+ status: "ok",
1796
+ output: merged
1797
+ });
1798
+ }
1799
+ merged = dedupeArrays2(merged);
1800
+ const exactDedupeSpan = telemetry?.startSpan({
1801
+ name: "struktur.exact_dedupe",
1802
+ kind: "CHAIN",
1803
+ parentSpan: strategySpan,
1804
+ attributes: {
1805
+ "dedupe.method": "exact_hashing"
1806
+ }
1807
+ });
1808
+ if (exactDedupeSpan && telemetry) {
1809
+ telemetry.recordEvent(exactDedupeSpan, {
1810
+ type: "merge",
1811
+ strategy: "exact_hash_dedupe",
1812
+ inputCount: Object.keys(merged).length,
1813
+ outputCount: Object.keys(merged).length
1814
+ });
1815
+ telemetry.endSpan(exactDedupeSpan, {
1816
+ status: "ok",
1817
+ output: merged
1818
+ });
1819
+ }
1820
+ const dedupePrompt = buildDeduplicationPrompt(schema, merged);
1821
+ debug?.dedupeStart({
1822
+ dedupeId: "sequential_auto_dedupe",
1823
+ itemCount: Object.keys(merged).length
1824
+ });
1825
+ const llmDedupeSpan = telemetry?.startSpan({
1826
+ name: "struktur.llm_dedupe",
1827
+ kind: "CHAIN",
1828
+ parentSpan: strategySpan,
1829
+ attributes: {
1830
+ "dedupe.method": "llm"
1831
+ }
1832
+ });
1833
+ const dedupeResponse = await runWithRetries({
1834
+ model: this.config.dedupeModel ?? this.config.model,
1835
+ schema: dedupeSchema2,
1836
+ system: dedupePrompt.system,
1837
+ user: dedupePrompt.user,
1838
+ events: options.events,
1839
+ execute: this.config.dedupeExecute,
1840
+ strict: this.config.strict,
1841
+ debug,
1842
+ callId: "sequential_auto_dedupe",
1843
+ telemetry: telemetry ?? void 0,
1844
+ parentSpan: llmDedupeSpan
1845
+ });
1846
+ step += 1;
1847
+ await options.events?.onStep?.({
1848
+ step,
1849
+ total: totalSteps,
1850
+ label: "dedupe"
1851
+ });
1852
+ debug?.step({
1853
+ step,
1854
+ total: totalSteps,
1855
+ label: "dedupe",
1856
+ strategy: this.name
1857
+ });
1858
+ let deduped = merged;
1859
+ for (const key of dedupeResponse.data.keys) {
1860
+ deduped = removeByPath2(deduped, key);
1861
+ }
1862
+ debug?.dedupeComplete({
1863
+ dedupeId: "sequential_auto_dedupe",
1864
+ duplicatesFound: dedupeResponse.data.keys.length,
1865
+ itemsRemoved: dedupeResponse.data.keys.length
1866
+ });
1867
+ if (llmDedupeSpan && telemetry) {
1868
+ telemetry.recordEvent(llmDedupeSpan, {
1869
+ type: "merge",
1870
+ strategy: "llm_dedupe",
1871
+ inputCount: Object.keys(merged).length,
1872
+ outputCount: Object.keys(deduped).length,
1873
+ deduped: dedupeResponse.data.keys.length
1874
+ });
1875
+ telemetry.endSpan(llmDedupeSpan, {
1876
+ status: "ok",
1877
+ output: deduped
1878
+ });
1879
+ }
1880
+ telemetry?.endSpan(strategySpan, {
1881
+ status: "ok",
1882
+ output: deduped
1883
+ });
1884
+ return {
1885
+ data: deduped,
1886
+ usage: mergeUsage([...usages, dedupeResponse.usage])
1887
+ };
1888
+ }
1889
+ };
1890
/**
 * Factory for the sequential auto-merge strategy.
 *
 * @param {object} config - Strategy configuration, passed to the constructor unchanged.
 * @returns {SequentialAutoMergeStrategy} A new strategy instance.
 */
var sequentialAutoMerge = (config) => new SequentialAutoMergeStrategy(config);
1893
+
1894
+ // src/strategies/DoublePassStrategy.ts
1895
/**
 * Extraction strategy that walks the artifact batches twice.
 *
 * Pass 1 extracts every batch through `runConcurrently` and then asks
 * `config.mergeModel` to merge the per-batch results into a single object.
 * Pass 2 iterates the same batches in order, handing the current data
 * (serialized as JSON) to `buildSequentialPrompt` and replacing it with the
 * result of each call.
 */
var DoublePassStrategy = class {
  name = "double-pass";
  config;
  /**
   * @param {object} config - Strategy configuration. This class reads
   *   `model`, `mergeModel`, `execute`, `strict`, `chunkSize`, `maxImages`,
   *   `concurrency` and `outputInstructions` from it.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Total step count reported to progress listeners for `artifacts`.
   *
   * @param {Array} artifacts - Artifacts that will be batched.
   * @returns {number} Twice the number of batches, plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length * 2 + 3;
  }
  /**
   * Runs both passes and returns the refined data.
   *
   * @param {object} options - Run options. Reads `artifacts`, `schema`,
   *   `strict`, and the optional `events`, `debug` and `telemetry` hooks.
   * @returns {Promise<{data: *, usage: *}>} Data produced by the last pass-2
   *   call (or by the pass-1 merge when there are no batches) and the result
   *   of `mergeUsage` over every model call made.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.double-pass",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;
    // Advances the shared counter and reports it to both listeners. The value
    // is captured before awaiting: pass-1 tasks run concurrently and may bump
    // `step` while `onStep` is pending, which previously made `debug.step`
    // log a different number than the one just sent to `onStep`.
    const reportStep = async (label) => {
      step += 1;
      const currentStep = step;
      await options.events?.onStep?.({
        step: currentStep,
        total: totalSteps,
        label
      });
      debug?.step({
        step: currentStep,
        total: totalSteps,
        label,
        strategy: this.name
      });
    };
    const pass1Span = telemetry?.startSpan({
      name: "struktur.pass_1",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 1,
        "pass.type": "parallel_extraction"
      }
    });
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `double_pass_1_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass1Span
      });
      await reportStep(`pass 1 batch ${index + 1}/${batches.length}`);
      return result;
    });
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    debug?.mergeStart({
      mergeId: "double_pass_1_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const pass1MergeSpan = telemetry?.startSpan({
      name: "struktur.pass_1_merge",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length
      }
    });
    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data)
    );
    // NOTE(review): pass 1 honours `options.strict`, while this merge call and
    // pass 2 only read `config.strict` — confirm that is intended.
    const merged = await extractWithPrompt({
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute,
      strict: this.config.strict,
      debug,
      callId: "double_pass_1_merge",
      telemetry: telemetry ?? void 0,
      parentSpan: pass1MergeSpan
    });
    await reportStep("pass 1 merge");
    debug?.mergeComplete({ mergeId: "double_pass_1_merge", success: true });
    if (pass1MergeSpan && telemetry) {
      telemetry.recordEvent(pass1MergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1
      });
      telemetry.endSpan(pass1MergeSpan, {
        status: "ok",
        output: merged.data
      });
    }
    telemetry?.endSpan(pass1Span, {
      status: "ok",
      output: merged.data
    });
    const pass2Span = telemetry?.startSpan({
      name: "struktur.pass_2",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 2,
        "pass.type": "sequential_refinement"
      }
    });
    let currentData = merged.data;
    const usages = [...results.map((r) => r.usage), merged.usage];
    // Pass 2 is strictly sequential: each call sees the previous call's output.
    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: this.config.strict,
        debug,
        callId: `double_pass_2_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass2Span
      });
      currentData = result.data;
      usages.push(result.usage);
      await reportStep(`pass 2 batch ${index + 1}/${batches.length}`);
    }
    telemetry?.endSpan(pass2Span, {
      status: "ok",
      output: currentData
    });
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: currentData
    });
    return { data: currentData, usage: mergeUsage(usages) };
  }
};
2100
/**
 * Factory for the double-pass strategy.
 *
 * @param {object} config - Strategy configuration, passed to the constructor unchanged.
 * @returns {DoublePassStrategy} A new strategy instance.
 */
var doublePass = (config) => new DoublePassStrategy(config);
2103
+
2104
+ // src/strategies/DoublePassAutoMergeStrategy.ts
2105
// Schema for the LLM dedupe response: an object whose only property, `keys`,
// is a required array of strings. Each string is later handed to
// `removeByPath3` as a "<field>.<index>" path.
var dedupeSchema3 = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" }
    }
  },
  required: ["keys"],
  additionalProperties: false
};
2113
/**
 * Returns a shallow copy of `data` in which every array-valued field has been
 * passed through `findExactDuplicatesWithHashing` / `deduplicateByIndices`.
 * Non-array fields are copied as they are; `data` itself is not reassigned.
 *
 * @param {object} data - Record whose array fields should be de-duplicated.
 * @returns {object} New record with the de-duplicated arrays.
 */
var dedupeArrays3 = (data) => {
  const output = { ...data };
  Object.keys(output).forEach((field) => {
    const items = output[field];
    if (!Array.isArray(items)) {
      return;
    }
    output[field] = deduplicateByIndices(
      items,
      findExactDuplicatesWithHashing(items)
    );
  });
  return output;
};
2123
/**
 * Removes one array element addressed by a "<field>.<index>" path.
 *
 * The path is split on "."; the first segment names a field of `data` and the
 * second is the element index. `data` is returned untouched when the path is
 * malformed, the field is not an array, or the index is out of range.
 *
 * The index must be a plain non-negative decimal integer. Converting with
 * `Number()` alone would turn an empty segment ("items.") into 0 and let a
 * negative index ("items.-1") make `splice` remove from the end of the array,
 * so either malformed path would silently delete a real element.
 *
 * @param {object} data - Record holding the array field.
 * @param {string} path - Path of the form "<field>.<index>".
 * @returns {object} `data` itself when nothing was removed, otherwise a
 *   shallow copy whose field holds a new array without the element.
 */
var removeByPath3 = (data, path) => {
  const [root, indexStr] = path.split(".");
  if (!root || indexStr === void 0 || !/^\d+$/.test(indexStr)) {
    return data;
  }
  const index = Number(indexStr);
  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }
  // Copy before splicing so the caller's array is never mutated.
  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
2137
/**
 * Double-pass extraction strategy that merges pass-1 results locally.
 *
 * Pass 1 extracts every batch through `runConcurrently`, folds the results
 * together with `SmartDataMerger`, removes exact duplicates with
 * `dedupeArrays3`, and then asks a model (`config.dedupeModel`, falling back
 * to `config.model`) for the paths of further duplicates to drop.
 * Pass 2 iterates the same batches in order, handing the current data
 * (serialized as JSON) to `buildSequentialPrompt` and replacing it with the
 * result of each call.
 */
var DoublePassAutoMergeStrategy = class {
  name = "double-pass-auto-merge";
  config;
  /**
   * @param {object} config - Strategy configuration. This class reads
   *   `model`, `dedupeModel`, `execute`, `dedupeExecute`, `strict`,
   *   `chunkSize`, `maxImages`, `concurrency` and `outputInstructions`.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Total step count reported to progress listeners for `artifacts`.
   *
   * @param {Array} artifacts - Artifacts that will be batched.
   * @returns {number} Twice the number of batches, plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length * 2 + 3;
  }
  /**
   * Runs both passes and returns the refined data.
   *
   * @param {object} options - Run options. Reads `artifacts`, `schema`,
   *   `strict`, and the optional `events`, `debug` and `telemetry` hooks.
   * @returns {Promise<{data: *, usage: *}>} Data produced by the last pass-2
   *   call (or the deduped pass-1 data when there are no batches) and the
   *   result of `mergeUsage` over every model call made.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.double-pass-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;
    // Advances the shared counter and reports it to both listeners. The value
    // is captured before awaiting: pass-1 tasks run concurrently and may bump
    // `step` while `onStep` is pending, which previously made `debug.step`
    // log a different number than the one just sent to `onStep`.
    const reportStep = async (label) => {
      step += 1;
      const currentStep = step;
      await options.events?.onStep?.({
        step: currentStep,
        total: totalSteps,
        label
      });
      debug?.step({
        step: currentStep,
        total: totalSteps,
        label,
        strategy: this.name
      });
    };
    const pass1Span = telemetry?.startSpan({
      name: "struktur.pass_1",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 1,
        "pass.type": "parallel_extraction"
      }
    });
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `double_pass_auto_1_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass1Span
      });
      await reportStep(`pass 1 batch ${index + 1}/${batches.length}`);
      return result;
    });
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    const merger = new SmartDataMerger(
      options.schema
    );
    let merged = {};
    debug?.mergeStart({
      mergeId: "double_pass_auto_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length
      }
    });
    for (const result of results) {
      merged = merger.merge(merged, result.data);
      // Per-field bookkeeping only; the merge itself happened above.
      for (const key of Object.keys(result.data)) {
        const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
        const rightArray = Array.isArray(
          result.data[key]
        ) ? result.data[key].length : void 0;
        debug?.smartMergeField({
          mergeId: "double_pass_auto_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray
        });
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1
          });
        }
      }
    }
    debug?.mergeComplete({ mergeId: "double_pass_auto_merge", success: true });
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged
      });
    }
    merged = dedupeArrays3(merged);
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "dedupe.method": "exact_hashing"
      }
    });
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged
      });
    }
    const dedupePrompt = buildDeduplicationPrompt(schema, merged);
    debug?.dedupeStart({
      dedupeId: "double_pass_auto_dedupe",
      itemCount: Object.keys(merged).length
    });
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "dedupe.method": "llm"
      }
    });
    const dedupeResponse = await runWithRetries({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema3,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "double_pass_auto_dedupe",
      telemetry: telemetry ?? void 0,
      parentSpan: llmDedupeSpan
    });
    await reportStep("pass 1 dedupe");
    // Every returned path indexes into `merged` as the model saw it. Removing
    // by index shifts all later elements, so the paths are applied from the
    // highest index down, and a repeated path is applied only once. Paths
    // without a usable index sort last.
    const removalIndex = (path) => {
      const index = Number(path.split(".")[1]);
      return Number.isFinite(index) ? index : -1;
    };
    const removalPaths = [...new Set(dedupeResponse.data.keys)].sort(
      (a, b) => removalIndex(b) - removalIndex(a)
    );
    let deduped = merged;
    for (const key of removalPaths) {
      deduped = removeByPath3(deduped, key);
    }
    debug?.dedupeComplete({
      dedupeId: "double_pass_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length
    });
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped
      });
    }
    telemetry?.endSpan(pass1Span, {
      status: "ok",
      output: deduped
    });
    let currentData = deduped;
    const usages = [...results.map((r) => r.usage), dedupeResponse.usage];
    const pass2Span = telemetry?.startSpan({
      name: "struktur.pass_2",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 2,
        "pass.type": "sequential_refinement"
      }
    });
    // Pass 2 is strictly sequential: each call sees the previous call's output.
    // NOTE(review): pass 1 honours `options.strict`, while the dedupe call and
    // pass 2 only read `config.strict` — confirm that is intended.
    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: this.config.strict,
        debug,
        callId: `double_pass_auto_2_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass2Span
      });
      currentData = result.data;
      usages.push(result.usage);
      await reportStep(`pass 2 batch ${index + 1}/${batches.length}`);
    }
    telemetry?.endSpan(pass2Span, {
      status: "ok",
      output: currentData
    });
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: currentData
    });
    return { data: currentData, usage: mergeUsage(usages) };
  }
};
2416
/**
 * Factory for the double-pass auto-merge strategy.
 *
 * @param {object} config - Strategy configuration, passed to the constructor unchanged.
 * @returns {DoublePassAutoMergeStrategy} A new strategy instance.
 */
var doublePassAutoMerge = (config) => new DoublePassAutoMergeStrategy(config);
2419
+ export {
2420
+ DoublePassAutoMergeStrategy,
2421
+ DoublePassStrategy,
2422
+ ParallelAutoMergeStrategy,
2423
+ ParallelStrategy,
2424
+ SequentialAutoMergeStrategy,
2425
+ SequentialStrategy,
2426
+ SimpleStrategy,
2427
+ doublePass,
2428
+ doublePassAutoMerge,
2429
+ parallel,
2430
+ parallelAutoMerge,
2431
+ sequential,
2432
+ sequentialAutoMerge,
2433
+ simple
2434
+ };
2435
+ //# sourceMappingURL=strategies.js.map