@struktur/sdk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -0
- package/package.json +33 -0
- package/src/artifacts/AGENTS.md +16 -0
- package/src/artifacts/fileToArtifact.test.ts +37 -0
- package/src/artifacts/fileToArtifact.ts +44 -0
- package/src/artifacts/input.test.ts +243 -0
- package/src/artifacts/input.ts +360 -0
- package/src/artifacts/providers.test.ts +19 -0
- package/src/artifacts/providers.ts +7 -0
- package/src/artifacts/urlToArtifact.test.ts +23 -0
- package/src/artifacts/urlToArtifact.ts +19 -0
- package/src/auth/AGENTS.md +11 -0
- package/src/auth/config.test.ts +132 -0
- package/src/auth/config.ts +129 -0
- package/src/auth/tokens.test.ts +58 -0
- package/src/auth/tokens.ts +229 -0
- package/src/chunking/AGENTS.md +11 -0
- package/src/chunking/ArtifactBatcher.test.ts +22 -0
- package/src/chunking/ArtifactBatcher.ts +110 -0
- package/src/chunking/ArtifactSplitter.test.ts +38 -0
- package/src/chunking/ArtifactSplitter.ts +151 -0
- package/src/debug/AGENTS.md +79 -0
- package/src/debug/logger.test.ts +244 -0
- package/src/debug/logger.ts +211 -0
- package/src/extract.test.ts +22 -0
- package/src/extract.ts +114 -0
- package/src/fields.test.ts +663 -0
- package/src/fields.ts +239 -0
- package/src/index.test.ts +20 -0
- package/src/index.ts +93 -0
- package/src/llm/AGENTS.md +9 -0
- package/src/llm/LLMClient.test.ts +196 -0
- package/src/llm/LLMClient.ts +106 -0
- package/src/llm/RetryingRunner.test.ts +174 -0
- package/src/llm/RetryingRunner.ts +188 -0
- package/src/llm/message.test.ts +42 -0
- package/src/llm/message.ts +47 -0
- package/src/llm/models.test.ts +82 -0
- package/src/llm/models.ts +190 -0
- package/src/merge/AGENTS.md +6 -0
- package/src/merge/Deduplicator.test.ts +108 -0
- package/src/merge/Deduplicator.ts +45 -0
- package/src/merge/SmartDataMerger.test.ts +177 -0
- package/src/merge/SmartDataMerger.ts +56 -0
- package/src/parsers/AGENTS.md +58 -0
- package/src/parsers/collect.test.ts +56 -0
- package/src/parsers/collect.ts +31 -0
- package/src/parsers/index.ts +6 -0
- package/src/parsers/mime.test.ts +91 -0
- package/src/parsers/mime.ts +137 -0
- package/src/parsers/npm.ts +26 -0
- package/src/parsers/pdf.test.ts +394 -0
- package/src/parsers/pdf.ts +194 -0
- package/src/parsers/runner.test.ts +95 -0
- package/src/parsers/runner.ts +177 -0
- package/src/parsers/types.ts +29 -0
- package/src/prompts/AGENTS.md +8 -0
- package/src/prompts/DeduplicationPrompt.test.ts +41 -0
- package/src/prompts/DeduplicationPrompt.ts +37 -0
- package/src/prompts/ExtractorPrompt.test.ts +21 -0
- package/src/prompts/ExtractorPrompt.ts +72 -0
- package/src/prompts/ParallelMergerPrompt.test.ts +8 -0
- package/src/prompts/ParallelMergerPrompt.ts +37 -0
- package/src/prompts/SequentialExtractorPrompt.test.ts +24 -0
- package/src/prompts/SequentialExtractorPrompt.ts +82 -0
- package/src/prompts/formatArtifacts.test.ts +39 -0
- package/src/prompts/formatArtifacts.ts +46 -0
- package/src/strategies/AGENTS.md +6 -0
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +53 -0
- package/src/strategies/DoublePassAutoMergeStrategy.ts +270 -0
- package/src/strategies/DoublePassStrategy.test.ts +48 -0
- package/src/strategies/DoublePassStrategy.ts +179 -0
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +152 -0
- package/src/strategies/ParallelAutoMergeStrategy.ts +241 -0
- package/src/strategies/ParallelStrategy.test.ts +61 -0
- package/src/strategies/ParallelStrategy.ts +157 -0
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +66 -0
- package/src/strategies/SequentialAutoMergeStrategy.ts +222 -0
- package/src/strategies/SequentialStrategy.test.ts +53 -0
- package/src/strategies/SequentialStrategy.ts +119 -0
- package/src/strategies/SimpleStrategy.test.ts +46 -0
- package/src/strategies/SimpleStrategy.ts +74 -0
- package/src/strategies/concurrency.test.ts +16 -0
- package/src/strategies/concurrency.ts +14 -0
- package/src/strategies/index.test.ts +20 -0
- package/src/strategies/index.ts +7 -0
- package/src/strategies/utils.test.ts +76 -0
- package/src/strategies/utils.ts +56 -0
- package/src/tokenization.test.ts +119 -0
- package/src/tokenization.ts +71 -0
- package/src/types.test.ts +25 -0
- package/src/types.ts +116 -0
- package/src/validation/AGENTS.md +6 -0
- package/src/validation/validator.test.ts +172 -0
- package/src/validation/validator.ts +82 -0
- package/tsconfig.json +22 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { test, expect } from "bun:test";
|
|
2
|
+
import type { JSONSchemaType } from "ajv";
|
|
3
|
+
import { DoublePassAutoMergeStrategy } from "./DoublePassAutoMergeStrategy";
|
|
4
|
+
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
+
|
|
6
|
+
// Shape the strategy under test is expected to produce.
type Output = { title: string };

// JSON schema describing `Output`: a single required string field.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    title: { type: "string" },
  },
  required: ["title"],
  additionalProperties: false,
};

// One small text artifact; its raw payload is an empty buffer.
const artifacts: Artifact[] = [
  {
    id: "a1",
    type: "text",
    raw: async () => Buffer.from(""),
    contents: [{ text: "abcdefgh" }],
  },
];
|
|
23
|
+
|
|
24
|
+
test("DoublePassAutoMergeStrategy runs both passes", async () => {
  // Fresh usage record for every stubbed model reply.
  const stubUsage = () => ({ inputTokens: 1, outputTokens: 1, totalTokens: 2 });

  // Counts extraction calls only; the dedupe stub below does not touch it.
  let calls = 0;
  const extractionStub = (async () => {
    calls += 1;
    return { data: { title: `pass-${calls}` }, usage: stubUsage() };
  }) as any;

  // Dedupe stub reports no duplicates.
  const dedupeStub = (async () => ({
    data: { keys: [] },
    usage: stubUsage(),
  })) as any;

  const strategy = new DoublePassAutoMergeStrategy<Output>({
    model: {},
    chunkSize: 10,
    execute: extractionStub,
    dedupeExecute: dedupeStub,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const outcome = await strategy.run(options);

  // The final data must come from the second extraction call.
  expect(outcome.data.title).toBe("pass-2");
  expect(calls).toBe(2);
});
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
+
import type { ExtractionOptions } from "../types";
|
|
3
|
+
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
+
import { buildDeduplicationPrompt } from "../prompts/DeduplicationPrompt";
|
|
5
|
+
import { buildSequentialPrompt } from "../prompts/SequentialExtractorPrompt";
|
|
6
|
+
import {
|
|
7
|
+
extractWithPrompt,
|
|
8
|
+
getBatches,
|
|
9
|
+
mergeUsage,
|
|
10
|
+
serializeSchema,
|
|
11
|
+
} from "./utils";
|
|
12
|
+
import { SmartDataMerger } from "../merge/SmartDataMerger";
|
|
13
|
+
import {
|
|
14
|
+
findExactDuplicatesWithHashing,
|
|
15
|
+
deduplicateByIndices,
|
|
16
|
+
} from "../merge/Deduplicator";
|
|
17
|
+
import { runConcurrently } from "./concurrency";
|
|
18
|
+
import { runWithRetries } from "../llm/RetryingRunner";
|
|
19
|
+
|
|
20
|
+
/**
 * Settings accepted by `DoublePassAutoMergeStrategy`.
 */
export type DoublePassAutoMergeStrategyConfig = {
  /** Model handle forwarded to every extraction call. */
  model: unknown;
  /** Passed to `getBatches` as the `maxTokens` limit. */
  chunkSize: number;
  /** Limit handed to `runConcurrently` for pass 1; defaults to the number of batches. */
  concurrency?: number;
  /** Passed to `getBatches` as the `maxImages` limit. */
  maxImages?: number;
  /** Extra instructions forwarded to the extractor and sequential prompt builders. */
  outputInstructions?: string;
  /** Model handle for the deduplication call; `model` is used when omitted. */
  dedupeModel?: unknown;
  /** Optional runner forwarded to the extraction calls. */
  execute?: typeof runWithRetries;
  /** Optional runner forwarded to the deduplication call. */
  dedupeExecute?: typeof runWithRetries;
  /** Strictness flag forwarded to the model calls. */
  strict?: boolean;
};
|
|
31
|
+
|
|
32
|
+
/**
 * Schema for the deduplication reply: an object whose only property, `keys`,
 * is a required array of strings.
 */
const dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" },
    },
  },
  required: ["keys"],
  additionalProperties: false,
} as const;
|
|
40
|
+
|
|
41
|
+
/**
 * Returns a shallow copy of `data` in which every top-level array field has
 * been passed through `findExactDuplicatesWithHashing` and
 * `deduplicateByIndices`. Non-array fields are copied unchanged and the input
 * object itself is not modified.
 */
const dedupeArrays = (data: Record<string, unknown>) => {
  const cleaned: Record<string, unknown> = { ...data };

  Object.keys(cleaned).forEach((field) => {
    const candidate = cleaned[field];
    if (!Array.isArray(candidate)) {
      return;
    }
    const duplicateIndices = findExactDuplicatesWithHashing(candidate);
    cleaned[field] = deduplicateByIndices(candidate, duplicateIndices);
  });

  return cleaned;
};
|
|
51
|
+
|
|
52
|
+
/**
 * Returns a copy of `data` with one element removed from a top-level array.
 *
 * @param data - Source object; it is never mutated.
 * @param path - Locator of the form `<field>.<index>`, e.g. `"items.2"`.
 *   Anything after the index segment is ignored.
 * @returns A shallow copy with the element removed, or `data` itself when the
 *   path does not name an existing element of a top-level array.
 */
const removeByPath = (
  data: Record<string, unknown>,
  path: string,
): Record<string, unknown> => {
  const [root, indexStr] = path.split(".");

  // Only plain decimal digits are accepted. Number("") and Number(" ") are 0,
  // so a path such as "items." would otherwise delete the first element, and a
  // negative value would make splice count from the end of the array.
  if (!root || indexStr === undefined || !/^\d+$/.test(indexStr)) {
    return data;
  }

  const index = Number(indexStr);
  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }

  // Copy before splicing so the caller's array stays untouched.
  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
|
|
68
|
+
|
|
69
|
+
/**
 * Two-pass extraction strategy with schema-driven merging.
 *
 * Pass 1 extracts from every batch through `runConcurrently`, folds the
 * per-batch results together with `SmartDataMerger`, drops exact duplicate
 * array items via hashing, and then asks a model which remaining array items
 * should be removed. Pass 2 walks the same batches one after another; each
 * call receives the current data serialized as JSON and its reply replaces
 * the current data.
 */
export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
  public name = "double-pass-auto-merge";
  private config: DoublePassAutoMergeStrategyConfig;

  constructor(config: DoublePassAutoMergeStrategyConfig) {
    this.config = config;
  }

  /**
   * Step total reported alongside progress events: two per batch plus three.
   *
   * @param artifacts - Artifacts that will be split into batches.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length * 2 + 3;
  }

  /**
   * Runs both passes and returns the data produced by the last pass-2 call
   * (or the deduplicated pass-1 data when there are no batches), together
   * with the combined usage of every model call.
   *
   * @param options - Artifacts, schema, and optional events/debug/strict settings.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    // A per-call `strict` overrides the configured default for every model
    // call in both passes, not only for pass 1.
    const strict = options.strict ?? this.config.strict;

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    // Pass 1: one independent extraction per batch.
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `double_pass_auto_1_batch_${index + 1}`,
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    const merger = new SmartDataMerger(
      options.schema as Record<string, unknown>,
    );
    let merged = {} as Record<string, unknown>;

    debug?.mergeStart({
      mergeId: "double_pass_auto_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    const arrayLength = (value: unknown) =>
      Array.isArray(value) ? value.length : undefined;

    for (const result of results) {
      const incoming = result.data as Record<string, unknown>;
      const fields = Object.keys(incoming);

      // Capture the left-hand sizes before merging; reading them afterwards
      // would report the size of the merged array instead.
      const leftCounts = fields.map((field) => arrayLength(merged[field]));

      merged = merger.merge(merged, incoming);

      // Log merge operation per field
      fields.forEach((field, position) => {
        debug?.smartMergeField({
          mergeId: "double_pass_auto_merge",
          field,
          operation: "merge_arrays",
          leftCount: leftCounts[position],
          rightCount: arrayLength(incoming[field]),
        });
      });
    }

    debug?.mergeComplete({ mergeId: "double_pass_auto_merge", success: true });

    // Exact duplicates are removed locally before the model is consulted.
    merged = dedupeArrays(merged);

    const dedupePrompt = buildDeduplicationPrompt(schema, merged);

    debug?.dedupeStart({
      dedupeId: "double_pass_auto_dedupe",
      itemCount: Object.keys(merged).length,
    });

    const dedupeResponse = await runWithRetries<{ keys: string[] }>({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict,
      debug,
      callId: "double_pass_auto_dedupe",
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "pass 1 dedupe",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "pass 1 dedupe",
      strategy: this.name,
    });

    // The reported paths index into `merged` as it was sent to the model.
    // Removing from the highest index downwards, with repeated paths
    // collapsed, keeps one removal from shifting the targets still pending.
    const indexOfPath = (path: string) => {
      const parsed = Number(path.split(".")[1]);
      return Number.isNaN(parsed) ? -1 : parsed;
    };
    const removalPaths = [...new Set(dedupeResponse.data.keys)].sort(
      (a, b) => indexOfPath(b) - indexOfPath(a),
    );

    let deduped = merged;
    for (const path of removalPaths) {
      deduped = removeByPath(deduped, path);
    }

    debug?.dedupeComplete({
      dedupeId: "double_pass_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length,
    });

    let currentData = deduped as T;
    const usages = [...results.map((r) => r.usage), dedupeResponse.usage];

    // Pass 2: sequential refinement, each call seeing the data so far.
    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions,
      );

      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `double_pass_auto_2_batch_${index + 1}`,
      });

      currentData = result.data;
      usages.push(result.usage);

      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
    }

    return { data: currentData, usage: mergeUsage(usages) };
  }
}
|
|
265
|
+
|
|
266
|
+
/** Factory shorthand: builds a `DoublePassAutoMergeStrategy<T>` from `config`. */
export const doublePassAutoMerge = <T>(
  config: DoublePassAutoMergeStrategyConfig,
) => new DoublePassAutoMergeStrategy<T>(config);
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { test, expect } from "bun:test";
|
|
2
|
+
import type { JSONSchemaType } from "ajv";
|
|
3
|
+
import { DoublePassStrategy } from "./DoublePassStrategy";
|
|
4
|
+
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
+
|
|
6
|
+
// Shape the strategy under test is expected to produce.
type Output = { title: string };

// JSON schema describing `Output`: a single required string field.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    title: { type: "string" },
  },
  required: ["title"],
  additionalProperties: false,
};

// One small text artifact; its raw payload is an empty buffer.
const artifacts: Artifact[] = [
  {
    id: "a1",
    type: "text",
    raw: async () => Buffer.from(""),
    contents: [{ text: "abcdefgh" }],
  },
];
|
|
23
|
+
|
|
24
|
+
test("DoublePassStrategy runs second pass", async () => {
  // Every stubbed model call bumps this counter and labels its reply with it.
  let calls = 0;
  const modelStub = (async () => {
    calls += 1;
    return {
      data: { title: `pass-${calls}` },
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  }) as any;

  const strategy = new DoublePassStrategy<Output>({
    model: {},
    mergeModel: {},
    chunkSize: 10,
    execute: modelStub,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const outcome = await strategy.run(options);

  // Three calls are expected and the last reply must be the final data.
  expect(outcome.data.title).toBe("pass-3");
  expect(calls).toBe(3);
});
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
+
import type { ExtractionOptions } from "../types";
|
|
3
|
+
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
+
import { buildParallelMergerPrompt } from "../prompts/ParallelMergerPrompt";
|
|
5
|
+
import { buildSequentialPrompt } from "../prompts/SequentialExtractorPrompt";
|
|
6
|
+
import {
|
|
7
|
+
extractWithPrompt,
|
|
8
|
+
getBatches,
|
|
9
|
+
mergeUsage,
|
|
10
|
+
serializeSchema,
|
|
11
|
+
} from "./utils";
|
|
12
|
+
import { runConcurrently } from "./concurrency";
|
|
13
|
+
import { runWithRetries } from "../llm/RetryingRunner";
|
|
14
|
+
|
|
15
|
+
/**
 * Settings accepted by `DoublePassStrategy`.
 */
export type DoublePassStrategyConfig = {
  /** Model handle forwarded to the per-batch extraction calls of both passes. */
  model: unknown;
  /** Model handle forwarded to the call that merges the pass-1 results. */
  mergeModel: unknown;
  /** Passed to `getBatches` as the `maxTokens` limit. */
  chunkSize: number;
  /** Limit handed to `runConcurrently` for pass 1; defaults to the number of batches. */
  concurrency?: number;
  /** Passed to `getBatches` as the `maxImages` limit. */
  maxImages?: number;
  /** Extra instructions forwarded to the extractor and sequential prompt builders. */
  outputInstructions?: string;
  /** Optional runner forwarded to every model call. */
  execute?: typeof runWithRetries;
  /** Strictness flag forwarded to the model calls. */
  strict?: boolean;
};
|
|
25
|
+
|
|
26
|
+
/**
 * Two-pass extraction strategy with a model-driven merge.
 *
 * Pass 1 extracts from every batch through `runConcurrently` and asks the
 * merge model to combine the per-batch results. Pass 2 walks the same batches
 * one after another; each call receives the current data serialized as JSON
 * and its reply replaces the current data.
 */
export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
  public name = "double-pass";
  private config: DoublePassStrategyConfig;

  constructor(config: DoublePassStrategyConfig) {
    this.config = config;
  }

  /**
   * Step total reported alongside progress events: two per batch plus three.
   *
   * @param artifacts - Artifacts that will be split into batches.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length * 2 + 3;
  }

  /**
   * Runs both passes and returns the data produced by the last pass-2 call
   * (or the merged pass-1 data when there are no batches), together with the
   * combined usage of every model call.
   *
   * @param options - Artifacts, schema, and optional events/debug/strict settings.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    // A per-call `strict` overrides the configured default for every model
    // call (pass 1, merge, and pass 2), not only for pass 1.
    const strict = options.strict ?? this.config.strict;

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    // Pass 1: one independent extraction per batch.
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `double_pass_1_batch_${index + 1}`,
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    debug?.mergeStart({
      mergeId: "double_pass_1_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Merge: the merge model sees only the pass-1 outputs, no artifacts.
    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data),
    );
    const merged = await extractWithPrompt<T>({
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute as never,
      strict,
      debug,
      callId: "double_pass_1_merge",
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "pass 1 merge",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "pass 1 merge",
      strategy: this.name,
    });
    debug?.mergeComplete({ mergeId: "double_pass_1_merge", success: true });

    let currentData = merged.data;
    const usages = [...results.map((r) => r.usage), merged.usage];

    // Pass 2: sequential refinement, each call seeing the data so far.
    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions,
      );

      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `double_pass_2_batch_${index + 1}`,
      });

      currentData = result.data;
      usages.push(result.usage);

      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
    }

    return { data: currentData, usage: mergeUsage(usages) };
  }
}
|
|
176
|
+
|
|
177
|
+
/** Factory shorthand: builds a `DoublePassStrategy<T>` from `config`. */
export const doublePass = <T>(config: DoublePassStrategyConfig) =>
  new DoublePassStrategy<T>(config);
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { test, expect } from "bun:test";
|
|
2
|
+
import type { JSONSchemaType } from "ajv";
|
|
3
|
+
import { ParallelAutoMergeStrategy, __testing__ } from "./ParallelAutoMergeStrategy";
|
|
4
|
+
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
+
|
|
6
|
+
// Shape the strategy under test is expected to produce.
type Output = { items: Array<{ id: number }> };

// JSON schema describing `Output`: a required array of `{ id: number }` objects.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    items: {
      type: "array",
      items: {
        type: "object",
        properties: {
          id: { type: "number" },
        },
        required: ["id"],
        additionalProperties: false,
      },
    },
  },
  required: ["items"],
  additionalProperties: false,
};

// Two text artifacts that differ only in their id.
const artifacts: Artifact[] = ["a1", "a2"].map(
  (id): Artifact => ({
    id,
    type: "text",
    raw: async () => Buffer.from(""),
    contents: [{ text: "abcdefgh" }],
  }),
);
|
|
39
|
+
|
|
40
|
+
test("ParallelAutoMergeStrategy deduplicates arrays", async () => {
  // Fresh usage record for every stubbed model reply.
  const stubUsage = () => ({ inputTokens: 1, outputTokens: 1, totalTokens: 2 });

  // Every extraction reply carries the same item twice.
  const extractionStub = (async () => ({
    data: { items: [{ id: 1 }, { id: 1 }] },
    usage: stubUsage(),
  })) as any;

  // Dedupe stub reports no duplicates, so only exact-match removal applies.
  const dedupeStub = (async () => ({
    data: { keys: [] },
    usage: stubUsage(),
  })) as any;

  const strategy = new ParallelAutoMergeStrategy<Output>({
    model: {},
    chunkSize: 2,
    execute: extractionStub,
    dedupeExecute: dedupeStub,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const outcome = await strategy.run(options);

  expect(outcome.data.items.length).toBe(1);
});
|
|
67
|
+
|
|
68
|
+
test("dedupeArrays removes duplicates from all array fields", () => {
  // Two array fields with repeats plus one scalar that must pass through.
  const input = {
    items: [{ id: 1 }, { id: 1 }, { id: 2 }],
    names: ["a", "a", "b"],
    count: 5,
  };

  const { items, names, count } = __testing__.dedupeArrays(input);

  expect(items).toEqual([{ id: 1 }, { id: 2 }]);
  expect(names).toEqual(["a", "b"]);
  expect(count).toBe(5);
});
|
|
81
|
+
|
|
82
|
+
test("dedupeArrays handles non-array fields", () => {
  // With no arrays present the output must equal the input.
  const scalarsOnly = { title: "test", count: 42 };

  const output = __testing__.dedupeArrays(scalarsOnly);

  expect(output).toEqual({ title: "test", count: 42 });
});
|
|
92
|
+
|
|
93
|
+
test("removeByPath removes item at path", () => {
  const input = { items: [{ id: 1 }, { id: 2 }, { id: 3 }] };

  // Drop the middle element.
  const output = __testing__.removeByPath(input, "items.1");

  expect(output.items).toEqual([{ id: 1 }, { id: 3 }]);
});
|
|
102
|
+
|
|
103
|
+
test("removeByPath handles first item", () => {
  const input = { items: [{ id: 1 }, { id: 2 }] };

  // Index 0 is the lower boundary of the array.
  const output = __testing__.removeByPath(input, "items.0");

  expect(output.items).toEqual([{ id: 2 }]);
});
|
|
112
|
+
|
|
113
|
+
test("removeByPath handles last item", () => {
  const input = { items: [{ id: 1 }, { id: 2 }] };

  // Index 1 is the upper boundary of this two-element array.
  const output = __testing__.removeByPath(input, "items.1");

  expect(output.items).toEqual([{ id: 1 }]);
});
|
|
122
|
+
|
|
123
|
+
test("removeByPath returns unchanged data for invalid path", () => {
  const input = { items: [{ id: 1 }] };

  // Empty path, missing index, non-numeric index, unknown field.
  const invalidPaths = ["", "items", "items.abc", "missing.0"];

  for (const path of invalidPaths) {
    expect(__testing__.removeByPath(input, path)).toEqual(input);
  }
});
|
|
133
|
+
|
|
134
|
+
test("removeByPath returns unchanged data for non-array field", () => {
  // `title` is a string, so an index into it must be ignored.
  const input = { title: "test" };

  const output = __testing__.removeByPath(input, "title.0");

  expect(output).toEqual(input);
});
|
|
143
|
+
|
|
144
|
+
test("removeByPath does not mutate original data", () => {
  const input = { items: [{ id: 1 }, { id: 2 }] };

  // The return value is deliberately discarded; only the input is inspected.
  __testing__.removeByPath(input, "items.0");

  expect(input.items).toEqual([{ id: 1 }, { id: 2 }]);
});
|