@struktur/sdk 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/auth/config.ts +57 -0
- package/src/extract.ts +55 -19
- package/src/index.ts +13 -0
- package/src/llm/LLMClient.ts +88 -7
- package/src/llm/RetryingRunner.ts +83 -1
- package/src/strategies/DoublePassAutoMergeStrategy.ts +140 -0
- package/src/strategies/DoublePassStrategy.ts +87 -0
- package/src/strategies/ParallelAutoMergeStrategy.ts +104 -0
- package/src/strategies/ParallelStrategy.ts +51 -0
- package/src/strategies/SequentialAutoMergeStrategy.ts +103 -0
- package/src/strategies/SequentialStrategy.ts +23 -0
- package/src/strategies/SimpleStrategy.ts +20 -0
- package/src/strategies/utils.ts +42 -3
- package/src/types.ts +66 -9
|
@@ -41,6 +41,20 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
41
41
|
|
|
42
42
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
43
43
|
const debug = options.debug;
|
|
44
|
+
const { telemetry } = options;
|
|
45
|
+
|
|
46
|
+
// Create strategy-level span
|
|
47
|
+
const strategySpan = telemetry?.startSpan({
|
|
48
|
+
name: "strategy.double-pass",
|
|
49
|
+
kind: "CHAIN",
|
|
50
|
+
attributes: {
|
|
51
|
+
"strategy.name": this.name,
|
|
52
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
53
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
54
|
+
"strategy.concurrency": this.config.concurrency,
|
|
55
|
+
},
|
|
56
|
+
});
|
|
57
|
+
|
|
44
58
|
const batches = getBatches(
|
|
45
59
|
options.artifacts,
|
|
46
60
|
{
|
|
@@ -48,11 +62,24 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
48
62
|
maxImages: this.config.maxImages,
|
|
49
63
|
},
|
|
50
64
|
debug,
|
|
65
|
+
telemetry ?? undefined,
|
|
66
|
+
strategySpan,
|
|
51
67
|
);
|
|
52
68
|
|
|
53
69
|
const schema = serializeSchema(options.schema);
|
|
54
70
|
const totalSteps = this.getEstimatedSteps(options.artifacts);
|
|
55
71
|
let step = 1;
|
|
72
|
+
|
|
73
|
+
// Create pass 1 span
|
|
74
|
+
const pass1Span = telemetry?.startSpan({
|
|
75
|
+
name: "struktur.pass_1",
|
|
76
|
+
kind: "CHAIN",
|
|
77
|
+
parentSpan: strategySpan,
|
|
78
|
+
attributes: {
|
|
79
|
+
"pass.number": 1,
|
|
80
|
+
"pass.type": "parallel_extraction",
|
|
81
|
+
},
|
|
82
|
+
});
|
|
56
83
|
|
|
57
84
|
const tasks = batches.map((batch, index) => async () => {
|
|
58
85
|
const prompt = buildExtractorPrompt(
|
|
@@ -71,6 +98,8 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
71
98
|
strict: options.strict ?? this.config.strict,
|
|
72
99
|
debug,
|
|
73
100
|
callId: `double_pass_1_batch_${index + 1}`,
|
|
101
|
+
telemetry: telemetry ?? undefined,
|
|
102
|
+
parentSpan: pass1Span,
|
|
74
103
|
});
|
|
75
104
|
step += 1;
|
|
76
105
|
await options.events?.onStep?.({
|
|
@@ -97,6 +126,17 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
97
126
|
inputCount: results.length,
|
|
98
127
|
strategy: this.name,
|
|
99
128
|
});
|
|
129
|
+
|
|
130
|
+
// Create pass 1 merge span
|
|
131
|
+
const pass1MergeSpan = telemetry?.startSpan({
|
|
132
|
+
name: "struktur.pass_1_merge",
|
|
133
|
+
kind: "CHAIN",
|
|
134
|
+
parentSpan: pass1Span,
|
|
135
|
+
attributes: {
|
|
136
|
+
"merge.strategy": "parallel",
|
|
137
|
+
"merge.input_count": results.length,
|
|
138
|
+
},
|
|
139
|
+
});
|
|
100
140
|
|
|
101
141
|
const mergePrompt = buildParallelMergerPrompt(
|
|
102
142
|
schema,
|
|
@@ -113,6 +153,8 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
113
153
|
strict: this.config.strict,
|
|
114
154
|
debug,
|
|
115
155
|
callId: "double_pass_1_merge",
|
|
156
|
+
telemetry: telemetry ?? undefined,
|
|
157
|
+
parentSpan: pass1MergeSpan,
|
|
116
158
|
});
|
|
117
159
|
|
|
118
160
|
step += 1;
|
|
@@ -128,6 +170,37 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
128
170
|
strategy: this.name,
|
|
129
171
|
});
|
|
130
172
|
debug?.mergeComplete({ mergeId: "double_pass_1_merge", success: true });
|
|
173
|
+
|
|
174
|
+
// End pass 1 merge span
|
|
175
|
+
if (pass1MergeSpan && telemetry) {
|
|
176
|
+
telemetry.recordEvent(pass1MergeSpan, {
|
|
177
|
+
type: "merge",
|
|
178
|
+
strategy: "parallel",
|
|
179
|
+
inputCount: results.length,
|
|
180
|
+
outputCount: 1,
|
|
181
|
+
});
|
|
182
|
+
telemetry.endSpan(pass1MergeSpan, {
|
|
183
|
+
status: "ok",
|
|
184
|
+
output: merged.data,
|
|
185
|
+
});
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// End pass 1 span
|
|
189
|
+
telemetry?.endSpan(pass1Span!, {
|
|
190
|
+
status: "ok",
|
|
191
|
+
output: merged.data,
|
|
192
|
+
});
|
|
193
|
+
|
|
194
|
+
// Create pass 2 span
|
|
195
|
+
const pass2Span = telemetry?.startSpan({
|
|
196
|
+
name: "struktur.pass_2",
|
|
197
|
+
kind: "CHAIN",
|
|
198
|
+
parentSpan: strategySpan,
|
|
199
|
+
attributes: {
|
|
200
|
+
"pass.number": 2,
|
|
201
|
+
"pass.type": "sequential_refinement",
|
|
202
|
+
},
|
|
203
|
+
});
|
|
131
204
|
|
|
132
205
|
let currentData = merged.data;
|
|
133
206
|
const usages = [...results.map((r) => r.usage), merged.usage];
|
|
@@ -151,6 +224,8 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
151
224
|
strict: this.config.strict,
|
|
152
225
|
debug,
|
|
153
226
|
callId: `double_pass_2_batch_${index + 1}`,
|
|
227
|
+
telemetry: telemetry ?? undefined,
|
|
228
|
+
parentSpan: pass2Span,
|
|
154
229
|
});
|
|
155
230
|
|
|
156
231
|
currentData = result.data;
|
|
@@ -169,6 +244,18 @@ export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
|
|
|
169
244
|
strategy: this.name,
|
|
170
245
|
});
|
|
171
246
|
}
|
|
247
|
+
|
|
248
|
+
// End pass 2 span
|
|
249
|
+
telemetry?.endSpan(pass2Span!, {
|
|
250
|
+
status: "ok",
|
|
251
|
+
output: currentData,
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
// End strategy span
|
|
255
|
+
telemetry?.endSpan(strategySpan!, {
|
|
256
|
+
status: "ok",
|
|
257
|
+
output: currentData,
|
|
258
|
+
});
|
|
172
259
|
|
|
173
260
|
return { data: currentData, usage: mergeUsage(usages) };
|
|
174
261
|
}
|
|
@@ -83,6 +83,20 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
83
83
|
|
|
84
84
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
85
85
|
const debug = options.debug;
|
|
86
|
+
const { telemetry } = options;
|
|
87
|
+
|
|
88
|
+
// Create strategy-level span
|
|
89
|
+
const strategySpan = telemetry?.startSpan({
|
|
90
|
+
name: "strategy.parallel-auto-merge",
|
|
91
|
+
kind: "CHAIN",
|
|
92
|
+
attributes: {
|
|
93
|
+
"strategy.name": this.name,
|
|
94
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
95
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
96
|
+
"strategy.concurrency": this.config.concurrency,
|
|
97
|
+
},
|
|
98
|
+
});
|
|
99
|
+
|
|
86
100
|
const batches = getBatches(
|
|
87
101
|
options.artifacts,
|
|
88
102
|
{
|
|
@@ -90,6 +104,8 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
90
104
|
maxImages: this.config.maxImages,
|
|
91
105
|
},
|
|
92
106
|
debug,
|
|
107
|
+
telemetry ?? undefined,
|
|
108
|
+
strategySpan,
|
|
93
109
|
);
|
|
94
110
|
|
|
95
111
|
const schema = serializeSchema(options.schema);
|
|
@@ -113,6 +129,8 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
113
129
|
strict: options.strict ?? this.config.strict,
|
|
114
130
|
debug,
|
|
115
131
|
callId: `parallel_auto_batch_${index + 1}`,
|
|
132
|
+
telemetry: telemetry ?? undefined,
|
|
133
|
+
parentSpan: strategySpan,
|
|
116
134
|
});
|
|
117
135
|
step += 1;
|
|
118
136
|
await options.events?.onStep?.({
|
|
@@ -144,6 +162,17 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
144
162
|
inputCount: results.length,
|
|
145
163
|
strategy: this.name,
|
|
146
164
|
});
|
|
165
|
+
|
|
166
|
+
// Create smart merge span
|
|
167
|
+
const mergeSpan = telemetry?.startSpan({
|
|
168
|
+
name: "struktur.smart_merge",
|
|
169
|
+
kind: "CHAIN",
|
|
170
|
+
parentSpan: strategySpan,
|
|
171
|
+
attributes: {
|
|
172
|
+
"merge.strategy": "smart",
|
|
173
|
+
"merge.input_count": results.length,
|
|
174
|
+
},
|
|
175
|
+
});
|
|
147
176
|
|
|
148
177
|
for (let i = 0; i < results.length; i++) {
|
|
149
178
|
const result = results[i]!;
|
|
@@ -169,6 +198,16 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
169
198
|
leftCount: leftArray,
|
|
170
199
|
rightCount: rightArray,
|
|
171
200
|
});
|
|
201
|
+
|
|
202
|
+
// Record merge event in telemetry
|
|
203
|
+
if (mergeSpan && telemetry) {
|
|
204
|
+
telemetry.recordEvent(mergeSpan, {
|
|
205
|
+
type: "merge",
|
|
206
|
+
strategy: "smart",
|
|
207
|
+
inputCount: rightArray ?? 1,
|
|
208
|
+
outputCount: leftArray ?? 1,
|
|
209
|
+
});
|
|
210
|
+
}
|
|
172
211
|
}
|
|
173
212
|
}
|
|
174
213
|
|
|
@@ -176,8 +215,40 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
176
215
|
mergeId: "parallel_auto_smart_merge",
|
|
177
216
|
success: true,
|
|
178
217
|
});
|
|
218
|
+
|
|
219
|
+
// End merge span
|
|
220
|
+
if (mergeSpan && telemetry) {
|
|
221
|
+
telemetry.endSpan(mergeSpan, {
|
|
222
|
+
status: "ok",
|
|
223
|
+
output: merged,
|
|
224
|
+
});
|
|
225
|
+
}
|
|
179
226
|
|
|
180
227
|
merged = dedupeArrays(merged);
|
|
228
|
+
|
|
229
|
+
// Create exact dedupe span
|
|
230
|
+
const exactDedupeSpan = telemetry?.startSpan({
|
|
231
|
+
name: "struktur.exact_dedupe",
|
|
232
|
+
kind: "CHAIN",
|
|
233
|
+
parentSpan: strategySpan,
|
|
234
|
+
attributes: {
|
|
235
|
+
"dedupe.method": "exact_hashing",
|
|
236
|
+
},
|
|
237
|
+
});
|
|
238
|
+
|
|
239
|
+
// End exact dedupe span
|
|
240
|
+
if (exactDedupeSpan && telemetry) {
|
|
241
|
+
telemetry.recordEvent(exactDedupeSpan, {
|
|
242
|
+
type: "merge",
|
|
243
|
+
strategy: "exact_hash_dedupe",
|
|
244
|
+
inputCount: Object.keys(merged).length,
|
|
245
|
+
outputCount: Object.keys(merged).length,
|
|
246
|
+
});
|
|
247
|
+
telemetry.endSpan(exactDedupeSpan, {
|
|
248
|
+
status: "ok",
|
|
249
|
+
output: merged,
|
|
250
|
+
});
|
|
251
|
+
}
|
|
181
252
|
|
|
182
253
|
const dedupePrompt = buildDeduplicationPrompt(schema, merged);
|
|
183
254
|
|
|
@@ -185,6 +256,16 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
185
256
|
dedupeId: "parallel_auto_dedupe",
|
|
186
257
|
itemCount: Object.keys(merged).length,
|
|
187
258
|
});
|
|
259
|
+
|
|
260
|
+
// Create LLM dedupe span
|
|
261
|
+
const llmDedupeSpan = telemetry?.startSpan({
|
|
262
|
+
name: "struktur.llm_dedupe",
|
|
263
|
+
kind: "CHAIN",
|
|
264
|
+
parentSpan: strategySpan,
|
|
265
|
+
attributes: {
|
|
266
|
+
"dedupe.method": "llm",
|
|
267
|
+
},
|
|
268
|
+
});
|
|
188
269
|
|
|
189
270
|
const dedupeResponse = await runWithRetries<{ keys: string[] }>({
|
|
190
271
|
model: this.config.dedupeModel ?? this.config.model,
|
|
@@ -196,6 +277,8 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
196
277
|
strict: this.config.strict,
|
|
197
278
|
debug,
|
|
198
279
|
callId: "parallel_auto_dedupe",
|
|
280
|
+
telemetry: telemetry ?? undefined,
|
|
281
|
+
parentSpan: llmDedupeSpan,
|
|
199
282
|
});
|
|
200
283
|
|
|
201
284
|
step += 1;
|
|
@@ -221,6 +304,27 @@ export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
221
304
|
duplicatesFound: dedupeResponse.data.keys.length,
|
|
222
305
|
itemsRemoved: dedupeResponse.data.keys.length,
|
|
223
306
|
});
|
|
307
|
+
|
|
308
|
+
// End LLM dedupe span
|
|
309
|
+
if (llmDedupeSpan && telemetry) {
|
|
310
|
+
telemetry.recordEvent(llmDedupeSpan, {
|
|
311
|
+
type: "merge",
|
|
312
|
+
strategy: "llm_dedupe",
|
|
313
|
+
inputCount: Object.keys(merged).length,
|
|
314
|
+
outputCount: Object.keys(deduped).length,
|
|
315
|
+
deduped: dedupeResponse.data.keys.length,
|
|
316
|
+
});
|
|
317
|
+
telemetry.endSpan(llmDedupeSpan, {
|
|
318
|
+
status: "ok",
|
|
319
|
+
output: deduped,
|
|
320
|
+
});
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// End strategy span
|
|
324
|
+
telemetry?.endSpan(strategySpan!, {
|
|
325
|
+
status: "ok",
|
|
326
|
+
output: deduped,
|
|
327
|
+
});
|
|
224
328
|
|
|
225
329
|
return {
|
|
226
330
|
data: deduped as T,
|
|
@@ -40,6 +40,20 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
40
40
|
|
|
41
41
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
42
42
|
const debug = options.debug;
|
|
43
|
+
const { telemetry } = options;
|
|
44
|
+
|
|
45
|
+
// Create strategy-level span
|
|
46
|
+
const strategySpan = telemetry?.startSpan({
|
|
47
|
+
name: "strategy.parallel",
|
|
48
|
+
kind: "CHAIN",
|
|
49
|
+
attributes: {
|
|
50
|
+
"strategy.name": this.name,
|
|
51
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
52
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
53
|
+
"strategy.concurrency": this.config.concurrency,
|
|
54
|
+
},
|
|
55
|
+
});
|
|
56
|
+
|
|
43
57
|
const batches = getBatches(
|
|
44
58
|
options.artifacts,
|
|
45
59
|
{
|
|
@@ -47,6 +61,8 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
47
61
|
maxImages: this.config.maxImages,
|
|
48
62
|
},
|
|
49
63
|
debug,
|
|
64
|
+
telemetry ?? undefined,
|
|
65
|
+
strategySpan,
|
|
50
66
|
);
|
|
51
67
|
|
|
52
68
|
const schema = serializeSchema(options.schema);
|
|
@@ -83,6 +99,8 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
83
99
|
strict: options.strict ?? this.config.strict,
|
|
84
100
|
debug,
|
|
85
101
|
callId: `parallel_batch_${index + 1}`,
|
|
102
|
+
telemetry: telemetry ?? undefined,
|
|
103
|
+
parentSpan: strategySpan,
|
|
86
104
|
});
|
|
87
105
|
// Emit progress after batch completes (if there are more batches)
|
|
88
106
|
const completedIndex = index + 1;
|
|
@@ -113,6 +131,17 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
113
131
|
inputCount: results.length,
|
|
114
132
|
strategy: this.name,
|
|
115
133
|
});
|
|
134
|
+
|
|
135
|
+
// Create merge span
|
|
136
|
+
const mergeSpan = telemetry?.startSpan({
|
|
137
|
+
name: "struktur.merge",
|
|
138
|
+
kind: "CHAIN",
|
|
139
|
+
parentSpan: strategySpan,
|
|
140
|
+
attributes: {
|
|
141
|
+
"merge.strategy": "parallel",
|
|
142
|
+
"merge.input_count": results.length,
|
|
143
|
+
},
|
|
144
|
+
});
|
|
116
145
|
|
|
117
146
|
const mergePrompt = buildParallelMergerPrompt(
|
|
118
147
|
schema,
|
|
@@ -129,6 +158,8 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
129
158
|
strict: this.config.strict,
|
|
130
159
|
debug,
|
|
131
160
|
callId: "parallel_merge",
|
|
161
|
+
telemetry: telemetry ?? undefined,
|
|
162
|
+
parentSpan: mergeSpan,
|
|
132
163
|
});
|
|
133
164
|
|
|
134
165
|
step += 1;
|
|
@@ -144,6 +175,26 @@ export class ParallelStrategy<T> implements ExtractionStrategy<T> {
|
|
|
144
175
|
strategy: this.name,
|
|
145
176
|
});
|
|
146
177
|
debug?.mergeComplete({ mergeId: "parallel_merge", success: true });
|
|
178
|
+
|
|
179
|
+
// End merge span
|
|
180
|
+
if (mergeSpan && telemetry) {
|
|
181
|
+
telemetry.recordEvent(mergeSpan, {
|
|
182
|
+
type: "merge",
|
|
183
|
+
strategy: "parallel",
|
|
184
|
+
inputCount: results.length,
|
|
185
|
+
outputCount: 1,
|
|
186
|
+
});
|
|
187
|
+
telemetry.endSpan(mergeSpan, {
|
|
188
|
+
status: "ok",
|
|
189
|
+
output: merged.data,
|
|
190
|
+
});
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// End strategy span
|
|
194
|
+
telemetry?.endSpan(strategySpan!, {
|
|
195
|
+
status: "ok",
|
|
196
|
+
output: merged.data,
|
|
197
|
+
});
|
|
147
198
|
|
|
148
199
|
return {
|
|
149
200
|
data: merged.data,
|
|
@@ -81,6 +81,19 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
81
81
|
|
|
82
82
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
83
83
|
const debug = options.debug;
|
|
84
|
+
const { telemetry } = options;
|
|
85
|
+
|
|
86
|
+
// Create strategy-level span
|
|
87
|
+
const strategySpan = telemetry?.startSpan({
|
|
88
|
+
name: "strategy.sequential-auto-merge",
|
|
89
|
+
kind: "CHAIN",
|
|
90
|
+
attributes: {
|
|
91
|
+
"strategy.name": this.name,
|
|
92
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
93
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
94
|
+
},
|
|
95
|
+
});
|
|
96
|
+
|
|
84
97
|
const batches = getBatches(
|
|
85
98
|
options.artifacts,
|
|
86
99
|
{
|
|
@@ -88,6 +101,8 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
88
101
|
maxImages: this.config.maxImages,
|
|
89
102
|
},
|
|
90
103
|
debug,
|
|
104
|
+
telemetry ?? undefined,
|
|
105
|
+
strategySpan,
|
|
91
106
|
);
|
|
92
107
|
|
|
93
108
|
const schema = serializeSchema(options.schema);
|
|
@@ -104,6 +119,17 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
104
119
|
inputCount: batches.length,
|
|
105
120
|
strategy: this.name,
|
|
106
121
|
});
|
|
122
|
+
|
|
123
|
+
// Create smart merge span
|
|
124
|
+
const mergeSpan = telemetry?.startSpan({
|
|
125
|
+
name: "struktur.smart_merge",
|
|
126
|
+
kind: "CHAIN",
|
|
127
|
+
parentSpan: strategySpan,
|
|
128
|
+
attributes: {
|
|
129
|
+
"merge.strategy": "smart",
|
|
130
|
+
"merge.input_count": batches.length,
|
|
131
|
+
},
|
|
132
|
+
});
|
|
107
133
|
|
|
108
134
|
for (const [index, batch] of batches.entries()) {
|
|
109
135
|
const prompt = buildExtractorPrompt(
|
|
@@ -122,6 +148,8 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
122
148
|
strict: options.strict ?? this.config.strict,
|
|
123
149
|
debug,
|
|
124
150
|
callId: `sequential_auto_batch_${index + 1}`,
|
|
151
|
+
telemetry: telemetry ?? undefined,
|
|
152
|
+
parentSpan: mergeSpan,
|
|
125
153
|
});
|
|
126
154
|
|
|
127
155
|
merged = merger.merge(merged, result.data as Record<string, unknown>);
|
|
@@ -145,6 +173,16 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
145
173
|
leftCount: leftArray,
|
|
146
174
|
rightCount: rightArray,
|
|
147
175
|
});
|
|
176
|
+
|
|
177
|
+
// Record merge event in telemetry
|
|
178
|
+
if (mergeSpan && telemetry) {
|
|
179
|
+
telemetry.recordEvent(mergeSpan, {
|
|
180
|
+
type: "merge",
|
|
181
|
+
strategy: "smart",
|
|
182
|
+
inputCount: rightArray ?? 1,
|
|
183
|
+
outputCount: leftArray ?? 1,
|
|
184
|
+
});
|
|
185
|
+
}
|
|
148
186
|
}
|
|
149
187
|
|
|
150
188
|
step += 1;
|
|
@@ -162,8 +200,40 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
162
200
|
}
|
|
163
201
|
|
|
164
202
|
debug?.mergeComplete({ mergeId: "sequential_auto_merge", success: true });
|
|
203
|
+
|
|
204
|
+
// End merge span
|
|
205
|
+
if (mergeSpan && telemetry) {
|
|
206
|
+
telemetry.endSpan(mergeSpan, {
|
|
207
|
+
status: "ok",
|
|
208
|
+
output: merged,
|
|
209
|
+
});
|
|
210
|
+
}
|
|
165
211
|
|
|
166
212
|
merged = dedupeArrays(merged);
|
|
213
|
+
|
|
214
|
+
// Create exact dedupe span
|
|
215
|
+
const exactDedupeSpan = telemetry?.startSpan({
|
|
216
|
+
name: "struktur.exact_dedupe",
|
|
217
|
+
kind: "CHAIN",
|
|
218
|
+
parentSpan: strategySpan,
|
|
219
|
+
attributes: {
|
|
220
|
+
"dedupe.method": "exact_hashing",
|
|
221
|
+
},
|
|
222
|
+
});
|
|
223
|
+
|
|
224
|
+
// End exact dedupe span
|
|
225
|
+
if (exactDedupeSpan && telemetry) {
|
|
226
|
+
telemetry.recordEvent(exactDedupeSpan, {
|
|
227
|
+
type: "merge",
|
|
228
|
+
strategy: "exact_hash_dedupe",
|
|
229
|
+
inputCount: Object.keys(merged).length,
|
|
230
|
+
outputCount: Object.keys(merged).length,
|
|
231
|
+
});
|
|
232
|
+
telemetry.endSpan(exactDedupeSpan, {
|
|
233
|
+
status: "ok",
|
|
234
|
+
output: merged,
|
|
235
|
+
});
|
|
236
|
+
}
|
|
167
237
|
|
|
168
238
|
const dedupePrompt = buildDeduplicationPrompt(schema, merged);
|
|
169
239
|
|
|
@@ -171,6 +241,16 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
171
241
|
dedupeId: "sequential_auto_dedupe",
|
|
172
242
|
itemCount: Object.keys(merged).length,
|
|
173
243
|
});
|
|
244
|
+
|
|
245
|
+
// Create LLM dedupe span
|
|
246
|
+
const llmDedupeSpan = telemetry?.startSpan({
|
|
247
|
+
name: "struktur.llm_dedupe",
|
|
248
|
+
kind: "CHAIN",
|
|
249
|
+
parentSpan: strategySpan,
|
|
250
|
+
attributes: {
|
|
251
|
+
"dedupe.method": "llm",
|
|
252
|
+
},
|
|
253
|
+
});
|
|
174
254
|
|
|
175
255
|
const dedupeResponse = await runWithRetries<{ keys: string[] }>({
|
|
176
256
|
model: this.config.dedupeModel ?? this.config.model,
|
|
@@ -182,6 +262,8 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
182
262
|
strict: this.config.strict,
|
|
183
263
|
debug,
|
|
184
264
|
callId: "sequential_auto_dedupe",
|
|
265
|
+
telemetry: telemetry ?? undefined,
|
|
266
|
+
parentSpan: llmDedupeSpan,
|
|
185
267
|
});
|
|
186
268
|
|
|
187
269
|
step += 1;
|
|
@@ -207,6 +289,27 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
|
|
|
207
289
|
duplicatesFound: dedupeResponse.data.keys.length,
|
|
208
290
|
itemsRemoved: dedupeResponse.data.keys.length,
|
|
209
291
|
});
|
|
292
|
+
|
|
293
|
+
// End LLM dedupe span
|
|
294
|
+
if (llmDedupeSpan && telemetry) {
|
|
295
|
+
telemetry.recordEvent(llmDedupeSpan, {
|
|
296
|
+
type: "merge",
|
|
297
|
+
strategy: "llm_dedupe",
|
|
298
|
+
inputCount: Object.keys(merged).length,
|
|
299
|
+
outputCount: Object.keys(deduped).length,
|
|
300
|
+
deduped: dedupeResponse.data.keys.length,
|
|
301
|
+
});
|
|
302
|
+
telemetry.endSpan(llmDedupeSpan, {
|
|
303
|
+
status: "ok",
|
|
304
|
+
output: deduped,
|
|
305
|
+
});
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// End strategy span
|
|
309
|
+
telemetry?.endSpan(strategySpan!, {
|
|
310
|
+
status: "ok",
|
|
311
|
+
output: deduped,
|
|
312
|
+
});
|
|
210
313
|
|
|
211
314
|
return {
|
|
212
315
|
data: deduped as T,
|
|
@@ -36,6 +36,19 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
|
|
|
36
36
|
|
|
37
37
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
38
38
|
const debug = options.debug;
|
|
39
|
+
const { telemetry } = options;
|
|
40
|
+
|
|
41
|
+
// Create strategy-level span
|
|
42
|
+
const strategySpan = telemetry?.startSpan({
|
|
43
|
+
name: "strategy.sequential",
|
|
44
|
+
kind: "CHAIN",
|
|
45
|
+
attributes: {
|
|
46
|
+
"strategy.name": this.name,
|
|
47
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
48
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
49
|
+
},
|
|
50
|
+
});
|
|
51
|
+
|
|
39
52
|
const batches = getBatches(
|
|
40
53
|
options.artifacts,
|
|
41
54
|
{
|
|
@@ -43,6 +56,8 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
|
|
|
43
56
|
maxImages: this.config.maxImages,
|
|
44
57
|
},
|
|
45
58
|
debug,
|
|
59
|
+
telemetry ?? undefined,
|
|
60
|
+
strategySpan,
|
|
46
61
|
);
|
|
47
62
|
|
|
48
63
|
const schema = serializeSchema(options.schema);
|
|
@@ -84,6 +99,8 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
|
|
|
84
99
|
strict: options.strict ?? this.config.strict,
|
|
85
100
|
debug,
|
|
86
101
|
callId: `sequential_batch_${index + 1}`,
|
|
102
|
+
telemetry: telemetry ?? undefined,
|
|
103
|
+
parentSpan: strategySpan,
|
|
87
104
|
});
|
|
88
105
|
|
|
89
106
|
currentData = result.data;
|
|
@@ -110,6 +127,12 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
|
|
|
110
127
|
throw new Error("No data extracted from sequential strategy");
|
|
111
128
|
}
|
|
112
129
|
|
|
130
|
+
// End strategy span
|
|
131
|
+
telemetry?.endSpan(strategySpan!, {
|
|
132
|
+
status: "ok",
|
|
133
|
+
output: currentData,
|
|
134
|
+
});
|
|
135
|
+
|
|
113
136
|
return { data: currentData, usage: mergeUsage(usages) };
|
|
114
137
|
}
|
|
115
138
|
}
|
|
@@ -25,6 +25,18 @@ export class SimpleStrategy<T> implements ExtractionStrategy<T> {
|
|
|
25
25
|
|
|
26
26
|
async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
|
|
27
27
|
const debug = options.debug;
|
|
28
|
+
const { telemetry } = options;
|
|
29
|
+
|
|
30
|
+
// Create strategy-level span
|
|
31
|
+
const strategySpan = telemetry?.startSpan({
|
|
32
|
+
name: "strategy.simple",
|
|
33
|
+
kind: "CHAIN",
|
|
34
|
+
attributes: {
|
|
35
|
+
"strategy.name": this.name,
|
|
36
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
37
|
+
},
|
|
38
|
+
});
|
|
39
|
+
|
|
28
40
|
const schema = serializeSchema(options.schema);
|
|
29
41
|
const { system, user } = buildExtractorPrompt(
|
|
30
42
|
options.artifacts,
|
|
@@ -56,6 +68,8 @@ export class SimpleStrategy<T> implements ExtractionStrategy<T> {
|
|
|
56
68
|
strict: options.strict ?? this.config.strict,
|
|
57
69
|
debug,
|
|
58
70
|
callId: "simple_extract",
|
|
71
|
+
telemetry,
|
|
72
|
+
parentSpan: strategySpan,
|
|
59
73
|
});
|
|
60
74
|
|
|
61
75
|
debug?.step({
|
|
@@ -65,6 +79,12 @@ export class SimpleStrategy<T> implements ExtractionStrategy<T> {
|
|
|
65
79
|
strategy: this.name,
|
|
66
80
|
});
|
|
67
81
|
|
|
82
|
+
// End strategy span
|
|
83
|
+
telemetry?.endSpan(strategySpan!, {
|
|
84
|
+
status: "ok",
|
|
85
|
+
output: result.data,
|
|
86
|
+
});
|
|
87
|
+
|
|
68
88
|
return { data: result.data, usage: result.usage };
|
|
69
89
|
}
|
|
70
90
|
}
|