@struktur/sdk 1.2.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +110 -0
- package/README.md +7 -3
- package/package.json +3 -1
- package/src/agent-cli-integration.test.ts +47 -0
- package/src/agent-export.test.ts +17 -0
- package/src/agent-tool-labels.test.ts +50 -0
- package/src/artifacts/AGENTS.md +1 -1
- package/src/auth/config.ts +57 -0
- package/src/extract.ts +55 -19
- package/src/index.ts +17 -0
- package/src/llm/LLMClient.test.ts +198 -0
- package/src/llm/LLMClient.ts +178 -20
- package/src/llm/RetryingRunner.ts +83 -1
- package/src/llm/resolveModel.ts +86 -0
- package/src/strategies/DoublePassAutoMergeStrategy.ts +140 -0
- package/src/strategies/DoublePassStrategy.ts +87 -0
- package/src/strategies/ParallelAutoMergeStrategy.ts +104 -0
- package/src/strategies/ParallelStrategy.ts +51 -0
- package/src/strategies/SequentialAutoMergeStrategy.ts +103 -0
- package/src/strategies/SequentialStrategy.ts +23 -0
- package/src/strategies/SimpleStrategy.ts +20 -0
- package/src/strategies/utils.ts +42 -3
- package/src/types.ts +67 -9
- package/src/validation/AGENTS.md +3 -2
- package/src/validation/validator.test.ts +32 -0
- package/src/validation/validator.ts +8 -0
|
@@ -84,6 +84,20 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
84
84
|
|
|
85
85
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
86
86
|
const debug = options.debug;
|
|
87
|
+
const { telemetry } = options;
|
|
88
|
+
|
|
89
|
+
// Create strategy-level span
|
|
90
|
+
const strategySpan = telemetry?.startSpan({
|
|
91
|
+
name: "strategy.double-pass-auto-merge",
|
|
92
|
+
kind: "CHAIN",
|
|
93
|
+
attributes: {
|
|
94
|
+
"strategy.name": this.name,
|
|
95
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
96
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
97
|
+
"strategy.concurrency": this.config.concurrency,
|
|
98
|
+
},
|
|
99
|
+
});
|
|
100
|
+
|
|
87
101
|
const batches = getBatches(
|
|
88
102
|
options.artifacts,
|
|
89
103
|
{
|
|
@@ -91,11 +105,24 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
91
105
|
maxImages: this.config.maxImages,
|
|
92
106
|
},
|
|
93
107
|
debug,
|
|
108
|
+
telemetry ?? undefined,
|
|
109
|
+
strategySpan,
|
|
94
110
|
);
|
|
95
111
|
|
|
96
112
|
const schema = serializeSchema(options.schema);
|
|
97
113
|
const totalSteps = this.getEstimatedSteps(options.artifacts);
|
|
98
114
|
let step = 1;
|
|
115
|
+
|
|
116
|
+
// Create pass 1 span
|
|
117
|
+
const pass1Span = telemetry?.startSpan({
|
|
118
|
+
name: "struktur.pass_1",
|
|
119
|
+
kind: "CHAIN",
|
|
120
|
+
parentSpan: strategySpan,
|
|
121
|
+
attributes: {
|
|
122
|
+
"pass.number": 1,
|
|
123
|
+
"pass.type": "parallel_extraction",
|
|
124
|
+
},
|
|
125
|
+
});
|
|
99
126
|
|
|
100
127
|
const tasks = batches.map((batch, index) => async () => {
|
|
101
128
|
const prompt = buildExtractorPrompt(
|
|
@@ -114,6 +141,8 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
114
141
|
strict: options.strict ?? this.config.strict,
|
|
115
142
|
debug,
|
|
116
143
|
callId: `double_pass_auto_1_batch_${index + 1}`,
|
|
144
|
+
telemetry: telemetry ?? undefined,
|
|
145
|
+
parentSpan: pass1Span,
|
|
117
146
|
});
|
|
118
147
|
step += 1;
|
|
119
148
|
await options.events?.onStep?.({
|
|
@@ -145,6 +174,17 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
145
174
|
inputCount: results.length,
|
|
146
175
|
strategy: this.name,
|
|
147
176
|
});
|
|
177
|
+
|
|
178
|
+
// Create smart merge span
|
|
179
|
+
const mergeSpan = telemetry?.startSpan({
|
|
180
|
+
name: "struktur.smart_merge",
|
|
181
|
+
kind: "CHAIN",
|
|
182
|
+
parentSpan: pass1Span,
|
|
183
|
+
attributes: {
|
|
184
|
+
"merge.strategy": "smart",
|
|
185
|
+
"merge.input_count": results.length,
|
|
186
|
+
},
|
|
187
|
+
});
|
|
148
188
|
|
|
149
189
|
for (let i = 0; i < results.length; i++) {
|
|
150
190
|
const result = results[i]!;
|
|
@@ -168,12 +208,54 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
168
208
|
leftCount: leftArray,
|
|
169
209
|
rightCount: rightArray,
|
|
170
210
|
});
|
|
211
|
+
|
|
212
|
+
// Record merge event in telemetry
|
|
213
|
+
if (mergeSpan && telemetry) {
|
|
214
|
+
telemetry.recordEvent(mergeSpan, {
|
|
215
|
+
type: "merge",
|
|
216
|
+
strategy: "smart",
|
|
217
|
+
inputCount: rightArray ?? 1,
|
|
218
|
+
outputCount: leftArray ?? 1,
|
|
219
|
+
});
|
|
220
|
+
}
|
|
171
221
|
}
|
|
172
222
|
}
|
|
173
223
|
|
|
174
224
|
debug?.mergeComplete({ mergeId: "double_pass_auto_merge", success: true });
|
|
225
|
+
|
|
226
|
+
// End merge span
|
|
227
|
+
if (mergeSpan && telemetry) {
|
|
228
|
+
telemetry.endSpan(mergeSpan, {
|
|
229
|
+
status: "ok",
|
|
230
|
+
output: merged,
|
|
231
|
+
});
|
|
232
|
+
}
|
|
175
233
|
|
|
176
234
|
merged = dedupeArrays(merged);
|
|
235
|
+
|
|
236
|
+
// Create exact dedupe span
|
|
237
|
+
const exactDedupeSpan = telemetry?.startSpan({
|
|
238
|
+
name: "struktur.exact_dedupe",
|
|
239
|
+
kind: "CHAIN",
|
|
240
|
+
parentSpan: pass1Span,
|
|
241
|
+
attributes: {
|
|
242
|
+
"dedupe.method": "exact_hashing",
|
|
243
|
+
},
|
|
244
|
+
});
|
|
245
|
+
|
|
246
|
+
// End exact dedupe span
|
|
247
|
+
if (exactDedupeSpan && telemetry) {
|
|
248
|
+
telemetry.recordEvent(exactDedupeSpan, {
|
|
249
|
+
type: "merge",
|
|
250
|
+
strategy: "exact_hash_dedupe",
|
|
251
|
+
inputCount: Object.keys(merged).length,
|
|
252
|
+
outputCount: Object.keys(merged).length,
|
|
253
|
+
});
|
|
254
|
+
telemetry.endSpan(exactDedupeSpan, {
|
|
255
|
+
status: "ok",
|
|
256
|
+
output: merged,
|
|
257
|
+
});
|
|
258
|
+
}
|
|
177
259
|
|
|
178
260
|
const dedupePrompt = buildDeduplicationPrompt(schema, merged);
|
|
179
261
|
|
|
@@ -181,6 +263,16 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
181
263
|
dedupeId: "double_pass_auto_dedupe",
|
|
182
264
|
itemCount: Object.keys(merged).length,
|
|
183
265
|
});
|
|
266
|
+
|
|
267
|
+
// Create LLM dedupe span
|
|
268
|
+
const llmDedupeSpan = telemetry?.startSpan({
|
|
269
|
+
name: "struktur.llm_dedupe",
|
|
270
|
+
kind: "CHAIN",
|
|
271
|
+
parentSpan: pass1Span,
|
|
272
|
+
attributes: {
|
|
273
|
+
"dedupe.method": "llm",
|
|
274
|
+
},
|
|
275
|
+
});
|
|
184
276
|
|
|
185
277
|
const dedupeResponse = await runWithRetries<{ keys: string[] }>({
|
|
186
278
|
model: this.config.dedupeModel ?? this.config.model,
|
|
@@ -192,6 +284,8 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
192
284
|
strict: this.config.strict,
|
|
193
285
|
debug,
|
|
194
286
|
callId: "double_pass_auto_dedupe",
|
|
287
|
+
telemetry: telemetry ?? undefined,
|
|
288
|
+
parentSpan: llmDedupeSpan,
|
|
195
289
|
});
|
|
196
290
|
|
|
197
291
|
step += 1;
|
|
@@ -217,9 +311,41 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
217
311
|
duplicatesFound: dedupeResponse.data.keys.length,
|
|
218
312
|
itemsRemoved: dedupeResponse.data.keys.length,
|
|
219
313
|
});
|
|
314
|
+
|
|
315
|
+
// End LLM dedupe span
|
|
316
|
+
if (llmDedupeSpan && telemetry) {
|
|
317
|
+
telemetry.recordEvent(llmDedupeSpan, {
|
|
318
|
+
type: "merge",
|
|
319
|
+
strategy: "llm_dedupe",
|
|
320
|
+
inputCount: Object.keys(merged).length,
|
|
321
|
+
outputCount: Object.keys(deduped).length,
|
|
322
|
+
deduped: dedupeResponse.data.keys.length,
|
|
323
|
+
});
|
|
324
|
+
telemetry.endSpan(llmDedupeSpan, {
|
|
325
|
+
status: "ok",
|
|
326
|
+
output: deduped,
|
|
327
|
+
});
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// End pass 1 span
|
|
331
|
+
telemetry?.endSpan(pass1Span!, {
|
|
332
|
+
status: "ok",
|
|
333
|
+
output: deduped,
|
|
334
|
+
});
|
|
220
335
|
|
|
221
336
|
let currentData = deduped as T;
|
|
222
337
|
const usages = [...results.map((r) => r.usage), dedupeResponse.usage];
|
|
338
|
+
|
|
339
|
+
// Create pass 2 span
|
|
340
|
+
const pass2Span = telemetry?.startSpan({
|
|
341
|
+
name: "struktur.pass_2",
|
|
342
|
+
kind: "CHAIN",
|
|
343
|
+
parentSpan: strategySpan,
|
|
344
|
+
attributes: {
|
|
345
|
+
"pass.number": 2,
|
|
346
|
+
"pass.type": "sequential_refinement",
|
|
347
|
+
},
|
|
348
|
+
});
|
|
223
349
|
|
|
224
350
|
for (const [index, batch] of batches.entries()) {
|
|
225
351
|
const prompt = buildSequentialPrompt(
|
|
@@ -240,6 +366,8 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
240
366
|
strict: this.config.strict,
|
|
241
367
|
debug,
|
|
242
368
|
callId: `double_pass_auto_2_batch_${index + 1}`,
|
|
369
|
+
telemetry: telemetry ?? undefined,
|
|
370
|
+
parentSpan: pass2Span,
|
|
243
371
|
});
|
|
244
372
|
|
|
245
373
|
currentData = result.data;
|
|
@@ -258,6 +386,18 @@ export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
258
386
|
strategy: this.name,
|
|
259
387
|
});
|
|
260
388
|
}
|
|
389
|
+
|
|
390
|
+
// End pass 2 span
|
|
391
|
+
telemetry?.endSpan(pass2Span!, {
|
|
392
|
+
status: "ok",
|
|
393
|
+
output: currentData,
|
|
394
|
+
});
|
|
395
|
+
|
|
396
|
+
// End strategy span
|
|
397
|
+
telemetry?.endSpan(strategySpan!, {
|
|
398
|
+
status: "ok",
|
|
399
|
+
output: currentData,
|
|
400
|
+
});
|
|
261
401
|
|
|
262
402
|
return { data: currentData, usage: mergeUsage(usages) };
|
|
263
403
|
}
|
|
@@ -41,6 +41,20 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
41
41
|
|
|
42
42
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
43
43
|
const debug = options.debug;
|
|
44
|
+
const { telemetry } = options;
|
|
45
|
+
|
|
46
|
+
// Create strategy-level span
|
|
47
|
+
const strategySpan = telemetry?.startSpan({
|
|
48
|
+
name: "strategy.double-pass",
|
|
49
|
+
kind: "CHAIN",
|
|
50
|
+
attributes: {
|
|
51
|
+
"strategy.name": this.name,
|
|
52
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
53
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
54
|
+
"strategy.concurrency": this.config.concurrency,
|
|
55
|
+
},
|
|
56
|
+
});
|
|
57
|
+
|
|
44
58
|
const batches = getBatches(
|
|
45
59
|
options.artifacts,
|
|
46
60
|
{
|
|
@@ -48,11 +62,24 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
48
62
|
maxImages: this.config.maxImages,
|
|
49
63
|
},
|
|
50
64
|
debug,
|
|
65
|
+
telemetry ?? undefined,
|
|
66
|
+
strategySpan,
|
|
51
67
|
);
|
|
52
68
|
|
|
53
69
|
const schema = serializeSchema(options.schema);
|
|
54
70
|
const totalSteps = this.getEstimatedSteps(options.artifacts);
|
|
55
71
|
let step = 1;
|
|
72
|
+
|
|
73
|
+
// Create pass 1 span
|
|
74
|
+
const pass1Span = telemetry?.startSpan({
|
|
75
|
+
name: "struktur.pass_1",
|
|
76
|
+
kind: "CHAIN",
|
|
77
|
+
parentSpan: strategySpan,
|
|
78
|
+
attributes: {
|
|
79
|
+
"pass.number": 1,
|
|
80
|
+
"pass.type": "parallel_extraction",
|
|
81
|
+
},
|
|
82
|
+
});
|
|
56
83
|
|
|
57
84
|
const tasks = batches.map((batch, index) => async () => {
|
|
58
85
|
const prompt = buildExtractorPrompt(
|
|
@@ -71,6 +98,8 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
71
98
|
strict: options.strict ?? this.config.strict,
|
|
72
99
|
debug,
|
|
73
100
|
callId: `double_pass_1_batch_${index + 1}`,
|
|
101
|
+
telemetry: telemetry ?? undefined,
|
|
102
|
+
parentSpan: pass1Span,
|
|
74
103
|
});
|
|
75
104
|
step += 1;
|
|
76
105
|
await options.events?.onStep?.({
|
|
@@ -97,6 +126,17 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
97
126
|
inputCount: results.length,
|
|
98
127
|
strategy: this.name,
|
|
99
128
|
});
|
|
129
|
+
|
|
130
|
+
// Create pass 1 merge span
|
|
131
|
+
const pass1MergeSpan = telemetry?.startSpan({
|
|
132
|
+
name: "struktur.pass_1_merge",
|
|
133
|
+
kind: "CHAIN",
|
|
134
|
+
parentSpan: pass1Span,
|
|
135
|
+
attributes: {
|
|
136
|
+
"merge.strategy": "parallel",
|
|
137
|
+
"merge.input_count": results.length,
|
|
138
|
+
},
|
|
139
|
+
});
|
|
100
140
|
|
|
101
141
|
const mergePrompt = buildParallelMergerPrompt(
|
|
102
142
|
schema,
|
|
@@ -113,6 +153,8 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
113
153
|
strict: this.config.strict,
|
|
114
154
|
debug,
|
|
115
155
|
callId: "double_pass_1_merge",
|
|
156
|
+
telemetry: telemetry ?? undefined,
|
|
157
|
+
parentSpan: pass1MergeSpan,
|
|
116
158
|
});
|
|
117
159
|
|
|
118
160
|
step += 1;
|
|
@@ -128,6 +170,37 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
128
170
|
strategy: this.name,
|
|
129
171
|
});
|
|
130
172
|
debug?.mergeComplete({ mergeId: "double_pass_1_merge", success: true });
|
|
173
|
+
|
|
174
|
+
// End pass 1 merge span
|
|
175
|
+
if (pass1MergeSpan && telemetry) {
|
|
176
|
+
telemetry.recordEvent(pass1MergeSpan, {
|
|
177
|
+
type: "merge",
|
|
178
|
+
strategy: "parallel",
|
|
179
|
+
inputCount: results.length,
|
|
180
|
+
outputCount: 1,
|
|
181
|
+
});
|
|
182
|
+
telemetry.endSpan(pass1MergeSpan, {
|
|
183
|
+
status: "ok",
|
|
184
|
+
output: merged.data,
|
|
185
|
+
});
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// End pass 1 span
|
|
189
|
+
telemetry?.endSpan(pass1Span!, {
|
|
190
|
+
status: "ok",
|
|
191
|
+
output: merged.data,
|
|
192
|
+
});
|
|
193
|
+
|
|
194
|
+
// Create pass 2 span
|
|
195
|
+
const pass2Span = telemetry?.startSpan({
|
|
196
|
+
name: "struktur.pass_2",
|
|
197
|
+
kind: "CHAIN",
|
|
198
|
+
parentSpan: strategySpan,
|
|
199
|
+
attributes: {
|
|
200
|
+
"pass.number": 2,
|
|
201
|
+
"pass.type": "sequential_refinement",
|
|
202
|
+
},
|
|
203
|
+
});
|
|
131
204
|
|
|
132
205
|
let currentData = merged.data;
|
|
133
206
|
const usages = [...results.map((r) => r.usage), merged.usage];
|
|
@@ -151,6 +224,8 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
151
224
|
strict: this.config.strict,
|
|
152
225
|
debug,
|
|
153
226
|
callId: `double_pass_2_batch_${index + 1}`,
|
|
227
|
+
telemetry: telemetry ?? undefined,
|
|
228
|
+
parentSpan: pass2Span,
|
|
154
229
|
});
|
|
155
230
|
|
|
156
231
|
currentData = result.data;
|
|
@@ -169,6 +244,18 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
169
244
|
strategy: this.name,
|
|
170
245
|
});
|
|
171
246
|
}
|
|
247
|
+
|
|
248
|
+
// End pass 2 span
|
|
249
|
+
telemetry?.endSpan(pass2Span!, {
|
|
250
|
+
status: "ok",
|
|
251
|
+
output: currentData,
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
// End strategy span
|
|
255
|
+
telemetry?.endSpan(strategySpan!, {
|
|
256
|
+
status: "ok",
|
|
257
|
+
output: currentData,
|
|
258
|
+
});
|
|
172
259
|
|
|
173
260
|
return { data: currentData, usage: mergeUsage(usages) };
|
|
174
261
|
}
|
|
@@ -83,6 +83,20 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
83
83
|
|
|
84
84
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
85
85
|
const debug = options.debug;
|
|
86
|
+
const { telemetry } = options;
|
|
87
|
+
|
|
88
|
+
// Create strategy-level span
|
|
89
|
+
const strategySpan = telemetry?.startSpan({
|
|
90
|
+
name: "strategy.parallel-auto-merge",
|
|
91
|
+
kind: "CHAIN",
|
|
92
|
+
attributes: {
|
|
93
|
+
"strategy.name": this.name,
|
|
94
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
95
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
96
|
+
"strategy.concurrency": this.config.concurrency,
|
|
97
|
+
},
|
|
98
|
+
});
|
|
99
|
+
|
|
86
100
|
const batches = getBatches(
|
|
87
101
|
options.artifacts,
|
|
88
102
|
{
|
|
@@ -90,6 +104,8 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
90
104
|
maxImages: this.config.maxImages,
|
|
91
105
|
},
|
|
92
106
|
debug,
|
|
107
|
+
telemetry ?? undefined,
|
|
108
|
+
strategySpan,
|
|
93
109
|
);
|
|
94
110
|
|
|
95
111
|
const schema = serializeSchema(options.schema);
|
|
@@ -113,6 +129,8 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
113
129
|
strict: options.strict ?? this.config.strict,
|
|
114
130
|
debug,
|
|
115
131
|
callId: `parallel_auto_batch_${index + 1}`,
|
|
132
|
+
telemetry: telemetry ?? undefined,
|
|
133
|
+
parentSpan: strategySpan,
|
|
116
134
|
});
|
|
117
135
|
step += 1;
|
|
118
136
|
await options.events?.onStep?.({
|
|
@@ -144,6 +162,17 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
144
162
|
inputCount: results.length,
|
|
145
163
|
strategy: this.name,
|
|
146
164
|
});
|
|
165
|
+
|
|
166
|
+
// Create smart merge span
|
|
167
|
+
const mergeSpan = telemetry?.startSpan({
|
|
168
|
+
name: "struktur.smart_merge",
|
|
169
|
+
kind: "CHAIN",
|
|
170
|
+
parentSpan: strategySpan,
|
|
171
|
+
attributes: {
|
|
172
|
+
"merge.strategy": "smart",
|
|
173
|
+
"merge.input_count": results.length,
|
|
174
|
+
},
|
|
175
|
+
});
|
|
147
176
|
|
|
148
177
|
for (let i = 0; i < results.length; i++) {
|
|
149
178
|
const result = results[i]!;
|
|
@@ -169,6 +198,16 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
169
198
|
leftCount: leftArray,
|
|
170
199
|
rightCount: rightArray,
|
|
171
200
|
});
|
|
201
|
+
|
|
202
|
+
// Record merge event in telemetry
|
|
203
|
+
if (mergeSpan && telemetry) {
|
|
204
|
+
telemetry.recordEvent(mergeSpan, {
|
|
205
|
+
type: "merge",
|
|
206
|
+
strategy: "smart",
|
|
207
|
+
inputCount: rightArray ?? 1,
|
|
208
|
+
outputCount: leftArray ?? 1,
|
|
209
|
+
});
|
|
210
|
+
}
|
|
172
211
|
}
|
|
173
212
|
}
|
|
174
213
|
|
|
@@ -176,8 +215,40 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
176
215
|
mergeId: "parallel_auto_smart_merge",
|
|
177
216
|
success: true,
|
|
178
217
|
});
|
|
218
|
+
|
|
219
|
+
// End merge span
|
|
220
|
+
if (mergeSpan && telemetry) {
|
|
221
|
+
telemetry.endSpan(mergeSpan, {
|
|
222
|
+
status: "ok",
|
|
223
|
+
output: merged,
|
|
224
|
+
});
|
|
225
|
+
}
|
|
179
226
|
|
|
180
227
|
merged = dedupeArrays(merged);
|
|
228
|
+
|
|
229
|
+
// Create exact dedupe span
|
|
230
|
+
const exactDedupeSpan = telemetry?.startSpan({
|
|
231
|
+
name: "struktur.exact_dedupe",
|
|
232
|
+
kind: "CHAIN",
|
|
233
|
+
parentSpan: strategySpan,
|
|
234
|
+
attributes: {
|
|
235
|
+
"dedupe.method": "exact_hashing",
|
|
236
|
+
},
|
|
237
|
+
});
|
|
238
|
+
|
|
239
|
+
// End exact dedupe span
|
|
240
|
+
if (exactDedupeSpan && telemetry) {
|
|
241
|
+
telemetry.recordEvent(exactDedupeSpan, {
|
|
242
|
+
type: "merge",
|
|
243
|
+
strategy: "exact_hash_dedupe",
|
|
244
|
+
inputCount: Object.keys(merged).length,
|
|
245
|
+
outputCount: Object.keys(merged).length,
|
|
246
|
+
});
|
|
247
|
+
telemetry.endSpan(exactDedupeSpan, {
|
|
248
|
+
status: "ok",
|
|
249
|
+
output: merged,
|
|
250
|
+
});
|
|
251
|
+
}
|
|
181
252
|
|
|
182
253
|
const dedupePrompt = buildDeduplicationPrompt(schema, merged);
|
|
183
254
|
|
|
@@ -185,6 +256,16 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
185
256
|
dedupeId: "parallel_auto_dedupe",
|
|
186
257
|
itemCount: Object.keys(merged).length,
|
|
187
258
|
});
|
|
259
|
+
|
|
260
|
+
// Create LLM dedupe span
|
|
261
|
+
const llmDedupeSpan = telemetry?.startSpan({
|
|
262
|
+
name: "struktur.llm_dedupe",
|
|
263
|
+
kind: "CHAIN",
|
|
264
|
+
parentSpan: strategySpan,
|
|
265
|
+
attributes: {
|
|
266
|
+
"dedupe.method": "llm",
|
|
267
|
+
},
|
|
268
|
+
});
|
|
188
269
|
|
|
189
270
|
const dedupeResponse = await runWithRetries<{ keys: string[] }>({
|
|
190
271
|
model: this.config.dedupeModel ?? this.config.model,
|
|
@@ -196,6 +277,8 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
196
277
|
strict: this.config.strict,
|
|
197
278
|
debug,
|
|
198
279
|
callId: "parallel_auto_dedupe",
|
|
280
|
+
telemetry: telemetry ?? undefined,
|
|
281
|
+
parentSpan: llmDedupeSpan,
|
|
199
282
|
});
|
|
200
283
|
|
|
201
284
|
step += 1;
|
|
@@ -221,6 +304,27 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
221
304
|
duplicatesFound: dedupeResponse.data.keys.length,
|
|
222
305
|
itemsRemoved: dedupeResponse.data.keys.length,
|
|
223
306
|
});
|
|
307
|
+
|
|
308
|
+
// End LLM dedupe span
|
|
309
|
+
if (llmDedupeSpan && telemetry) {
|
|
310
|
+
telemetry.recordEvent(llmDedupeSpan, {
|
|
311
|
+
type: "merge",
|
|
312
|
+
strategy: "llm_dedupe",
|
|
313
|
+
inputCount: Object.keys(merged).length,
|
|
314
|
+
outputCount: Object.keys(deduped).length,
|
|
315
|
+
deduped: dedupeResponse.data.keys.length,
|
|
316
|
+
});
|
|
317
|
+
telemetry.endSpan(llmDedupeSpan, {
|
|
318
|
+
status: "ok",
|
|
319
|
+
output: deduped,
|
|
320
|
+
});
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// End strategy span
|
|
324
|
+
telemetry?.endSpan(strategySpan!, {
|
|
325
|
+
status: "ok",
|
|
326
|
+
output: deduped,
|
|
327
|
+
});
|
|
224
328
|
|
|
225
329
|
return {
|
|
226
330
|
data: deduped as T,
|
|
@@ -40,6 +40,20 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
40
40
|
|
|
41
41
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
42
42
|
const debug = options.debug;
|
|
43
|
+
const { telemetry } = options;
|
|
44
|
+
|
|
45
|
+
// Create strategy-level span
|
|
46
|
+
const strategySpan = telemetry?.startSpan({
|
|
47
|
+
name: "strategy.parallel",
|
|
48
|
+
kind: "CHAIN",
|
|
49
|
+
attributes: {
|
|
50
|
+
"strategy.name": this.name,
|
|
51
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
52
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
53
|
+
"strategy.concurrency": this.config.concurrency,
|
|
54
|
+
},
|
|
55
|
+
});
|
|
56
|
+
|
|
43
57
|
const batches = getBatches(
|
|
44
58
|
options.artifacts,
|
|
45
59
|
{
|
|
@@ -47,6 +61,8 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
47
61
|
maxImages: this.config.maxImages,
|
|
48
62
|
},
|
|
49
63
|
debug,
|
|
64
|
+
telemetry ?? undefined,
|
|
65
|
+
strategySpan,
|
|
50
66
|
);
|
|
51
67
|
|
|
52
68
|
const schema = serializeSchema(options.schema);
|
|
@@ -83,6 +99,8 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
83
99
|
strict: options.strict ?? this.config.strict,
|
|
84
100
|
debug,
|
|
85
101
|
callId: `parallel_batch_${index + 1}`,
|
|
102
|
+
telemetry: telemetry ?? undefined,
|
|
103
|
+
parentSpan: strategySpan,
|
|
86
104
|
});
|
|
87
105
|
// Emit progress after batch completes (if there are more batches)
|
|
88
106
|
const completedIndex = index + 1;
|
|
@@ -113,6 +131,17 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
113
131
|
inputCount: results.length,
|
|
114
132
|
strategy: this.name,
|
|
115
133
|
});
|
|
134
|
+
|
|
135
|
+
// Create merge span
|
|
136
|
+
const mergeSpan = telemetry?.startSpan({
|
|
137
|
+
name: "struktur.merge",
|
|
138
|
+
kind: "CHAIN",
|
|
139
|
+
parentSpan: strategySpan,
|
|
140
|
+
attributes: {
|
|
141
|
+
"merge.strategy": "parallel",
|
|
142
|
+
"merge.input_count": results.length,
|
|
143
|
+
},
|
|
144
|
+
});
|
|
116
145
|
|
|
117
146
|
const mergePrompt = buildParallelMergerPrompt(
|
|
118
147
|
schema,
|
|
@@ -129,6 +158,8 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
129
158
|
strict: this.config.strict,
|
|
130
159
|
debug,
|
|
131
160
|
callId: "parallel_merge",
|
|
161
|
+
telemetry: telemetry ?? undefined,
|
|
162
|
+
parentSpan: mergeSpan,
|
|
132
163
|
});
|
|
133
164
|
|
|
134
165
|
step += 1;
|
|
@@ -144,6 +175,26 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
144
175
|
strategy: this.name,
|
|
145
176
|
});
|
|
146
177
|
debug?.mergeComplete({ mergeId: "parallel_merge", success: true });
|
|
178
|
+
|
|
179
|
+
// End merge span
|
|
180
|
+
if (mergeSpan && telemetry) {
|
|
181
|
+
telemetry.recordEvent(mergeSpan, {
|
|
182
|
+
type: "merge",
|
|
183
|
+
strategy: "parallel",
|
|
184
|
+
inputCount: results.length,
|
|
185
|
+
outputCount: 1,
|
|
186
|
+
});
|
|
187
|
+
telemetry.endSpan(mergeSpan, {
|
|
188
|
+
status: "ok",
|
|
189
|
+
output: merged.data,
|
|
190
|
+
});
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// End strategy span
|
|
194
|
+
telemetry?.endSpan(strategySpan!, {
|
|
195
|
+
status: "ok",
|
|
196
|
+
output: merged.data,
|
|
197
|
+
});
|
|
147
198
|
|
|
148
199
|
return {
|
|
149
200
|
data: merged.data,
|