@struktur/sdk 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4111 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/strategies.js +2435 -0
- package/dist/strategies.js.map +1 -0
- package/package.json +25 -13
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/providers.ts +0 -7
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/index.ts +0 -6
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/index.ts +0 -7
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
|
@@ -1,345 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
-
import { buildDeduplicationPrompt } from "../prompts/DeduplicationPrompt";
|
|
5
|
-
import {
|
|
6
|
-
extractWithPrompt,
|
|
7
|
-
getBatches,
|
|
8
|
-
mergeUsage,
|
|
9
|
-
serializeSchema,
|
|
10
|
-
} from "./utils";
|
|
11
|
-
import { runConcurrently } from "./concurrency";
|
|
12
|
-
import { SmartDataMerger } from "../merge/SmartDataMerger";
|
|
13
|
-
import {
|
|
14
|
-
findExactDuplicatesWithHashing,
|
|
15
|
-
deduplicateByIndices,
|
|
16
|
-
} from "../merge/Deduplicator";
|
|
17
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
18
|
-
|
|
19
|
-
/**
 * Configuration accepted by {@link ParallelAutoMergeStrategy}.
 */
export type ParallelAutoMergeStrategyConfig = {
  /** Model handed to every per-batch extraction call; also used for the dedupe call when `dedupeModel` is unset. */
  model: unknown;
  /** Forwarded to `getBatches` as `maxTokens`. */
  chunkSize: number;
  /** Limit passed to `runConcurrently`; when omitted, the number of batches is used. */
  concurrency?: number;
  /** Forwarded to `getBatches` as `maxImages`. */
  maxImages?: number;
  /** Forwarded to `buildExtractorPrompt` for every batch. */
  outputInstructions?: string;
  /** Model for the LLM deduplication call. */
  dedupeModel?: unknown;
  /** Passed as `execute` to the per-batch extraction calls. */
  execute?: typeof runWithRetries;
  /** Passed as `execute` to the LLM deduplication call. */
  dedupeExecute?: typeof runWithRetries;
  /** Strategy-level `strict` value; batch calls prefer `options.strict` when it is set. */
  strict?: boolean;
};
|
|
30
|
-
|
|
31
|
-
/**
 * JSON schema for the LLM deduplication response: an object whose only
 * property, `keys`, is a required array of strings.
 */
const dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" },
    },
  },
  required: ["keys"],
  additionalProperties: false,
} as const;
|
|
39
|
-
|
|
40
|
-
/**
 * Returns a shallow copy of `data` in which every top-level array has been
 * passed through `findExactDuplicatesWithHashing` / `deduplicateByIndices`.
 * Non-array fields are copied through untouched; `data` itself is not mutated.
 */
const dedupeArrays = (data: Record<string, unknown>) => {
  const cleaned: Record<string, unknown> = { ...data };
  Object.keys(cleaned).forEach((field) => {
    const current = cleaned[field];
    if (!Array.isArray(current)) {
      return;
    }
    const duplicateIndices = findExactDuplicatesWithHashing(current);
    cleaned[field] = deduplicateByIndices(current, duplicateIndices);
  });
  return cleaned;
};
|
|
50
|
-
|
|
51
|
-
/**
 * Removes one element from a top-level array of `data`.
 *
 * `path` has the form `"<field>.<index>"`. Only the first two dot-separated
 * segments are read; anything after them is ignored.
 *
 * The input is returned unchanged (same reference) when:
 * - the field name is empty,
 * - the index segment is missing or is not a plain non-negative decimal integer,
 * - `data[field]` is not an array, or
 * - the index is outside the array bounds.
 *
 * Otherwise a shallow copy of `data` is returned whose `field` is a new array
 * without the element at `index`. Neither `data` nor the array is mutated.
 *
 * @param data - Object whose top-level array should lose one element.
 * @param path - `"<field>.<index>"` locator of the element to drop.
 * @returns `data` itself when nothing was removed, otherwise the updated copy.
 */
const removeByPath = (data: Record<string, unknown>, path: string) => {
  const [root, indexStr] = path.split(".");
  const indexText = indexStr?.trim() ?? "";

  // Digits only: `Number("")` is 0 and `Number("-1")` is -1, so a bare
  // numeric conversion would delete the first or the last element on
  // malformed paths instead of ignoring them.
  if (!root || !/^\d+$/.test(indexText)) {
    return data;
  }

  const index = Number(indexText);
  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }

  const next = [...value.slice(0, index), ...value.slice(index + 1)];
  return { ...data, [root]: next };
};
|
|
67
|
-
|
|
68
|
-
/**
 * Extraction strategy that extracts every artifact batch concurrently, folds
 * the per-batch results together with `SmartDataMerger`, and then removes
 * duplicates in two passes: an exact-duplicate pass over the merged data
 * (`dedupeArrays`) and an LLM call that returns `"<field>.<index>"` paths of
 * entries to drop.
 */
export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
  public name = "parallel-auto-merge";
  private config: ParallelAutoMergeStrategyConfig;

  constructor(config: ParallelAutoMergeStrategyConfig) {
    this.config = config;
  }

  /**
   * Normalises the paths returned by the dedupe model so they can be applied
   * one after another without invalidating each other.
   *
   * - Paths that do not look like `"<field>.<non-negative integer>"` are dropped.
   * - Paths naming the same field and index are collapsed into one.
   * - The result is sorted by descending index, so removing an element never
   *   shifts the position of an element that is still waiting to be removed.
   *
   * NOTE(review): assumes every index refers to a position in the data that
   * was sent to the model in a single prompt — confirm against
   * `buildDeduplicationPrompt`.
   */
  private static orderPathsForRemoval(paths: readonly string[]): string[] {
    const unique = new Map<string, { path: string; index: number }>();
    for (const path of paths) {
      const [root, indexStr] = path.split(".");
      const indexText = indexStr?.trim() ?? "";
      if (!root || !/^\d+$/.test(indexText)) {
        continue;
      }
      const index = Number(indexText);
      const id = `${root}.${index}`;
      if (!unique.has(id)) {
        unique.set(id, { path: id, index });
      }
    }
    return [...unique.values()]
      .sort((a, b) => (a.index === b.index ? 0 : a.index < b.index ? 1 : -1))
      .map((entry) => entry.path);
  }

  /**
   * Estimated number of progress steps for `artifacts`: the number of batches
   * produced by `getBatches` plus three.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length + 3;
  }

  /**
   * Runs the full pipeline: concurrent batch extraction, smart merge, exact
   * dedupe, LLM dedupe.
   *
   * @param options - Extraction options (artifacts, schema, optional events,
   *   debug logger, telemetry and `strict` override).
   * @returns The deduplicated data and the usage of all model calls combined
   *   via `mergeUsage`.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `parallel_auto_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: strategySpan,
      });

      // Snapshot the counter: other tasks may bump `step` while `onStep` is
      // awaited, and both reports below must carry the same number.
      step += 1;
      const currentStep = step;
      const label = `batch ${index + 1}/${batches.length}`;
      await options.events?.onStep?.({
        step: currentStep,
        total: totalSteps,
        label,
      });
      debug?.step({
        step: currentStep,
        total: totalSteps,
        label,
        strategy: this.name,
      });
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    const merger = new SmartDataMerger(
      options.schema as Record<string, unknown>,
    );
    let merged = {} as Record<string, unknown>;

    debug?.mergeStart({
      mergeId: "parallel_auto_smart_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Create smart merge span
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length,
      },
    });

    for (const result of results) {
      const incoming = result.data as Record<string, unknown>;
      merged = merger.merge(merged, incoming);

      // Log merge operation per field. Note that the "left" count is read
      // from the accumulator after the merge has been applied.
      for (const key of Object.keys(incoming)) {
        const mergedValue = merged[key];
        const incomingValue = incoming[key];
        const leftArray = Array.isArray(mergedValue)
          ? mergedValue.length
          : undefined;
        const rightArray = Array.isArray(incomingValue)
          ? incomingValue.length
          : undefined;

        debug?.smartMergeField({
          mergeId: "parallel_auto_smart_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray,
        });

        // Record merge event in telemetry
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1,
          });
        }
      }
    }

    debug?.mergeComplete({
      mergeId: "parallel_auto_smart_merge",
      success: true,
    });

    // End merge span
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged,
      });
    }

    // Open the exact-dedupe span before doing the work so the span actually
    // covers the hashing pass it is named after.
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing",
      },
    });

    const fieldCountBeforeExactDedupe = Object.keys(merged).length;
    merged = dedupeArrays(merged);

    // End exact dedupe span
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: fieldCountBeforeExactDedupe,
        outputCount: Object.keys(merged).length,
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged,
      });
    }

    const dedupePrompt = buildDeduplicationPrompt(schema, merged);

    debug?.dedupeStart({
      dedupeId: "parallel_auto_dedupe",
      itemCount: Object.keys(merged).length,
    });

    // Create LLM dedupe span
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm",
      },
    });

    const dedupeResponse = await runWithRetries<{ keys: string[] }>({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "parallel_auto_dedupe",
      telemetry: telemetry ?? undefined,
      parentSpan: llmDedupeSpan,
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name,
    });

    // Apply removals from the highest index down and at most once per
    // position; applying them in model order would shift later indices and
    // delete the wrong entries.
    const removalPaths = ParallelAutoMergeStrategy.orderPathsForRemoval(
      dedupeResponse.data.keys,
    );
    let deduped = merged;
    for (const path of removalPaths) {
      deduped = removeByPath(deduped, path);
    }

    debug?.dedupeComplete({
      dedupeId: "parallel_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length,
    });

    // End LLM dedupe span
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length,
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped,
      });
    }

    // End strategy span
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: deduped,
      });
    }

    return {
      data: deduped as T,
      usage: mergeUsage([...results.map((r) => r.usage), dedupeResponse.usage]),
    };
  }
}
|
|
335
|
-
|
|
336
|
-
/**
 * Factory shorthand: builds a {@link ParallelAutoMergeStrategy} from `config`.
 */
export const parallelAutoMerge = <T>(config: ParallelAutoMergeStrategyConfig) =>
  new ParallelAutoMergeStrategy<T>(config);
|
|
341
|
-
|
|
342
|
-
/** Module-private helpers re-exported under one object so tests can reach them. */
export const __testing__ = {
  dedupeArrays: dedupeArrays,
  removeByPath: removeByPath,
};
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { ParallelStrategy } from "./ParallelStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
type Output = { title: string };

// Output schema used by the strategy under test: one required string field.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: { title: { type: "string" } },
  required: ["title"],
  additionalProperties: false,
};

// Builds a text artifact with an empty raw buffer and fixed 8-character content.
const textArtifact = (id: string): Artifact => ({
  id,
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "abcdefgh" }],
});

const artifacts: Artifact[] = [textArtifact("a1"), textArtifact("a2")];

test("ParallelStrategy merges batch results", async () => {
  let calls = 0;

  // Fresh response object per call so nothing is shared between invocations.
  const respond = (title: string) => ({
    data: { title },
    usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
  });

  const strategy = new ParallelStrategy<Output>({
    model: {},
    mergeModel: {},
    chunkSize: 2,
    execute: (async (request: any) => {
      calls += 1;
      // The stub treats a prompt containing <json-objects> as the merge call.
      const isMergeCall =
        typeof request.user === "string" &&
        request.user.includes("<json-objects>");
      return respond(isMergeCall ? "merged" : "chunk");
    }) as any,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };

  const result = await strategy.run(options);

  expect(result.data.title).toBe("merged");
  expect(calls).toBe(3);
});
|
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
-
import { buildParallelMergerPrompt } from "../prompts/ParallelMergerPrompt";
|
|
5
|
-
import {
|
|
6
|
-
extractWithPrompt,
|
|
7
|
-
getBatches,
|
|
8
|
-
mergeUsage,
|
|
9
|
-
serializeSchema,
|
|
10
|
-
} from "./utils";
|
|
11
|
-
import { runConcurrently } from "./concurrency";
|
|
12
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
13
|
-
|
|
14
|
-
/**
 * Configuration accepted by {@link ParallelStrategy}.
 */
export type ParallelStrategyConfig = {
  /** Model handed to every per-batch extraction call. */
  model: unknown;
  /** Model handed to the final merge call. */
  mergeModel: unknown;
  /** Forwarded to `getBatches` as `maxTokens`. */
  chunkSize: number;
  /** Limit passed to `runConcurrently`; when omitted, the number of batches is used. */
  concurrency?: number;
  /** Forwarded to `getBatches` as `maxImages`. */
  maxImages?: number;
  /** Forwarded to `buildExtractorPrompt` for every batch. */
  outputInstructions?: string;
  /** Passed as `execute` to both the batch calls and the merge call. */
  execute?: typeof runWithRetries;
  /** Strategy-level `strict` value; batch calls prefer `options.strict` when it is set. */
  strict?: boolean;
};
|
|
24
|
-
|
|
25
|
-
/**
 * Extraction strategy that extracts every artifact batch concurrently and then
 * asks `mergeModel` to combine the per-batch results into one object that
 * matches the caller's schema.
 */
export class ParallelStrategy<T> implements ExtractionStrategy<T> {
  public name = "parallel";
  private config: ParallelStrategyConfig;

  constructor(config: ParallelStrategyConfig) {
    this.config = config;
  }

  /**
   * Estimated number of progress steps for `artifacts`: the number of batches
   * produced by `getBatches` plus three.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length + 3;
  }

  /**
   * Runs the batch extractions, then the merge call.
   *
   * @param options - Extraction options (artifacts, schema, optional events,
   *   debug logger, telemetry and `strict` override).
   * @returns The merged data and the usage of all model calls combined via
   *   `mergeUsage`.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Batch calls and the merge call validate against the same user schema,
    // so the per-call override has to apply to both of them.
    const strict = options.strict ?? this.config.strict;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    // Emit start event
    const startLabel =
      batches.length > 1 ? `batch 1/${batches.length}` : "extract";
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: startLabel,
    });
    debug?.step({
      step,
      total: totalSteps,
      label: startLabel,
      strategy: this.name,
    });

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `parallel_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: strategySpan,
      });

      // Emit progress after batch completes (if there are more batches)
      const completedIndex = index + 1;
      if (completedIndex < batches.length) {
        // Snapshot the counter: other tasks may bump `step` while `onStep`
        // is awaited, and both reports below must carry the same number.
        step += 1;
        const currentStep = step;
        const label = `batch ${completedIndex + 1}/${batches.length}`;
        await options.events?.onStep?.({
          step: currentStep,
          total: totalSteps,
          label,
        });
        debug?.step({
          step: currentStep,
          total: totalSteps,
          label,
          strategy: this.name,
        });
      }
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    debug?.mergeStart({
      mergeId: "parallel_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Create merge span
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length,
      },
    });

    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data),
    );
    const merged = await extractWithPrompt<T>({
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute as never,
      strict,
      debug,
      callId: "parallel_merge",
      telemetry: telemetry ?? undefined,
      parentSpan: mergeSpan,
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "merge",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "merge",
      strategy: this.name,
    });
    debug?.mergeComplete({ mergeId: "parallel_merge", success: true });

    // End merge span
    if (mergeSpan && telemetry) {
      telemetry.recordEvent(mergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1,
      });
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged.data,
      });
    }

    // End strategy span
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: merged.data,
      });
    }

    return {
      data: merged.data,
      usage: mergeUsage([...results.map((r) => r.usage), merged.usage]),
    };
  }
}
|
|
205
|
-
|
|
206
|
-
/**
 * Factory shorthand: builds a {@link ParallelStrategy} from `config`.
 */
export const parallel = <T>(config: ParallelStrategyConfig) =>
  new ParallelStrategy<T>(config);
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { SequentialAutoMergeStrategy } from "./SequentialAutoMergeStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
type Output = { items: Array<{ id: number }> };

// Output schema used by the strategy under test: a required array of `{ id }` objects.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    items: {
      type: "array",
      items: {
        type: "object",
        properties: { id: { type: "number" } },
        required: ["id"],
        additionalProperties: false,
      },
    },
  },
  required: ["items"],
  additionalProperties: false,
};

// Builds a text artifact with an empty raw buffer and fixed 8-character content.
const textArtifact = (id: string): Artifact => ({
  id,
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "abcdefgh" }],
});

const artifacts: Artifact[] = [textArtifact("a1"), textArtifact("a2")];

test("SequentialAutoMergeStrategy merges and dedupes", async () => {
  // Fresh usage object per call so nothing is shared between invocations.
  const usage = () => ({ inputTokens: 1, outputTokens: 1, totalTokens: 2 });

  const strategy = new SequentialAutoMergeStrategy<Output>({
    model: {},
    chunkSize: 2,
    // Every extraction call reports the same item twice.
    execute: (async () => ({
      data: { items: [{ id: 1 }, { id: 1 }] },
      usage: usage(),
    })) as any,
    // The dedupe call reports no paths to remove.
    dedupeExecute: (async () => ({
      data: { keys: [] },
      usage: usage(),
    })) as any,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };

  const result = await strategy.run(options);

  expect(result.data.items.length).toBe(1);
});
|