@struktur/sdk 2.1.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4111 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/strategies.js +2435 -0
- package/dist/strategies.js.map +1 -0
- package/package.json +24 -12
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/providers.ts +0 -7
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/index.ts +0 -6
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/index.ts +0 -7
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { DoublePassAutoMergeStrategy } from "./DoublePassAutoMergeStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
/** Shape the strategy under test is asked to extract. */
type Output = { title: string };

// Smallest schema that still validates: one required string field.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    title: { type: "string" },
  },
  required: ["title"],
  additionalProperties: false,
};

// A single short text artifact, so chunking yields exactly one batch.
const artifacts: Artifact[] = [
  {
    id: "a1",
    type: "text",
    raw: async () => Buffer.from(""),
    contents: [{ text: "abcdefgh" }],
  },
];

test("DoublePassAutoMergeStrategy runs both passes", async () => {
  let extractionCalls = 0;

  // Stub for the extraction call: counts invocations and tags the title with
  // the invocation number so the test can tell which pass produced the result.
  const execute = (async () => {
    extractionCalls += 1;
    const data = { title: `pass-${extractionCalls}` };
    return {
      data,
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  }) as any;

  // Stub for the dedupe call: reports no duplicates.
  const dedupeExecute = (async () => {
    return {
      data: { keys: [] },
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  }) as any;

  const strategy = new DoublePassAutoMergeStrategy<Output>({
    model: {},
    chunkSize: 10,
    execute,
    dedupeExecute,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };

  const outcome = await strategy.run(options);

  // One extraction per pass; the final data comes from the second one.
  expect(outcome.data.title).toBe("pass-2");
  expect(extractionCalls).toBe(2);
});
|
|
@@ -1,410 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
-
import { buildDeduplicationPrompt } from "../prompts/DeduplicationPrompt";
|
|
5
|
-
import { buildSequentialPrompt } from "../prompts/SequentialExtractorPrompt";
|
|
6
|
-
import {
|
|
7
|
-
extractWithPrompt,
|
|
8
|
-
getBatches,
|
|
9
|
-
mergeUsage,
|
|
10
|
-
serializeSchema,
|
|
11
|
-
} from "./utils";
|
|
12
|
-
import { SmartDataMerger } from "../merge/SmartDataMerger";
|
|
13
|
-
import {
|
|
14
|
-
findExactDuplicatesWithHashing,
|
|
15
|
-
deduplicateByIndices,
|
|
16
|
-
} from "../merge/Deduplicator";
|
|
17
|
-
import { runConcurrently } from "./concurrency";
|
|
18
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
19
|
-
|
|
20
|
-
/**
 * Configuration accepted by `DoublePassAutoMergeStrategy`.
 */
export type DoublePassAutoMergeStrategyConfig = {
  /** Model handle forwarded to the extraction calls; also the dedupe fallback. */
  model: unknown;

  /** Passed to `getBatches` as `maxTokens` when splitting artifacts into batches. */
  chunkSize: number;

  /** Concurrency limit for pass 1; the number of batches is used when omitted. */
  concurrency?: number;

  /** Passed to `getBatches` as `maxImages`. */
  maxImages?: number;

  /** Extra instructions handed to the extractor and sequential prompt builders. */
  outputInstructions?: string;

  /** Model handle for the LLM dedupe call; `model` is used when omitted. */
  dedupeModel?: unknown;

  /** Optional replacement runner forwarded to the extraction calls. */
  execute?: typeof runWithRetries;

  /** Optional replacement runner forwarded to the LLM dedupe call. */
  dedupeExecute?: typeof runWithRetries;

  /** Strictness flag forwarded to the LLM calls. */
  strict?: boolean;
};
|
|
31
|
-
|
|
32
|
-
/**
 * JSON schema for the LLM dedupe response: an object whose only property,
 * `keys`, is a required array of strings.
 */
const dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: {
        type: "string",
      },
    },
  },
  required: ["keys"],
  additionalProperties: false,
} as const;
|
|
40
|
-
|
|
41
|
-
/**
 * Returns a shallow copy of `data` in which every array-valued field has been
 * run through `findExactDuplicatesWithHashing` + `deduplicateByIndices`.
 * Non-array fields are copied through unchanged; `data` itself is not modified.
 */
const dedupeArrays = (data: Record<string, unknown>) => {
  const copy: Record<string, unknown> = { ...data };

  Object.keys(copy).forEach((field) => {
    const current = copy[field];
    if (!Array.isArray(current)) {
      return;
    }
    const duplicateIndices = findExactDuplicatesWithHashing(current);
    copy[field] = deduplicateByIndices(current, duplicateIndices);
  });

  return copy;
};
|
|
51
|
-
|
|
52
|
-
/**
 * Removes a single element from an array-valued top-level field of `data`.
 *
 * `path` must have the form `"<field>.<index>"` (for example `"items.2"`),
 * where `<index>` is a plain non-negative decimal integer. Everything before
 * the last dot is treated as the field name.
 *
 * The input is never mutated. `data` is returned unchanged when the path is
 * malformed, the field is not an array, or the index is out of range. In
 * particular, an empty index (`"items."`), a negative index (`"items.-1"`)
 * and a deeper path (`"items.1.5"`) are all no-ops instead of deleting an
 * element the path never named.
 *
 * @param data Object holding the array to edit.
 * @param path Location of the element to drop, as `"<field>.<index>"`.
 * @returns A shallow copy of `data` with the element removed, or `data` itself.
 */
const removeByPath = (data: Record<string, unknown>, path: string) => {
  const separator = path.lastIndexOf(".");
  // No dot at all, or an empty field name before it.
  if (separator <= 0) {
    return data;
  }

  const root = path.slice(0, separator);
  const indexStr = path.slice(separator + 1);
  // Number("") is 0 and splice() accepts negatives and fractions, so only
  // digits are accepted here rather than relying on Number()/NaN.
  if (!/^\d+$/.test(indexStr)) {
    return data;
  }
  const index = Number(indexStr);

  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }

  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
|
|
68
|
-
|
|
69
|
-
/**
 * Two-pass extraction strategy with automatic merging and de-duplication.
 *
 * Pass 1 ("parallel_extraction") runs one extraction per batch concurrently,
 * folds the partial results together with `SmartDataMerger`, removes exact
 * duplicates from array fields, and then asks an LLM for the paths of any
 * remaining duplicate array items. Pass 2 ("sequential_refinement") walks the
 * batches again in order, handing the serialized current data to the
 * sequential prompt and replacing it with each response.
 */
export class DoublePassAutoMergeStrategy<T> implements ExtractionStrategy<T> {
  public name = "double-pass-auto-merge";
  private config: DoublePassAutoMergeStrategyConfig;

  constructor(config: DoublePassAutoMergeStrategyConfig) {
    this.config = config;
  }

  /**
   * Estimates how many progress steps a run will report: two per batch
   * (one for each pass) plus a fixed overhead of three.
   *
   * @param artifacts Artifacts that would be passed to `run`.
   * @returns The estimated total step count.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length * 2 + 3;
  }

  /**
   * Executes both passes and returns the refined data with the combined usage.
   *
   * @param options Extraction options (artifacts, schema, events, debug, telemetry, strict).
   * @returns The data produced by the last pass-2 call (or the pass-1 result
   *   when there are no batches) and the merged usage of every LLM call.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;
    // A per-call `strict` overrides the configured one for both extraction passes.
    const strict = options.strict ?? this.config.strict;

    // NOTE(review): if any LLM call below throws, the spans opened so far are
    // never ended. Closing them with an error status needs the telemetry
    // adapter's status vocabulary, which is not visible from this file.
    const strategySpan = telemetry?.startSpan({
      name: "strategy.double-pass-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    // ---- Pass 1: concurrent extraction -----------------------------------
    const pass1Span = telemetry?.startSpan({
      name: "struktur.pass_1",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 1,
        "pass.type": "parallel_extraction",
      },
    });

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `double_pass_auto_1_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: pass1Span,
      });
      // `step` is shared by all tasks, so step numbers follow completion
      // order rather than batch order.
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    // ---- Smart merge of the pass-1 results -------------------------------
    const merger = new SmartDataMerger(
      options.schema as Record<string, unknown>,
    );
    let merged = {} as Record<string, unknown>;

    debug?.mergeStart({
      mergeId: "double_pass_auto_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length,
      },
    });

    const arrayLength = (value: unknown): number | undefined =>
      Array.isArray(value) ? value.length : undefined;

    for (const result of results) {
      const incoming = result.data as Record<string, unknown>;
      const fields = Object.keys(incoming);
      // Sizes of the left-hand side must be read before merging; afterwards
      // `merged` already holds the combined result.
      const leftCounts = fields.map((field) => arrayLength(merged[field]));

      merged = merger.merge(merged, incoming);

      for (const [position, field] of fields.entries()) {
        const rightCount = arrayLength(incoming[field]);
        const mergedCount = arrayLength(merged[field]);

        debug?.smartMergeField({
          mergeId: "double_pass_auto_merge",
          field,
          operation: "merge_arrays",
          leftCount: leftCounts[position],
          rightCount,
        });

        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightCount ?? 1,
            outputCount: mergedCount ?? 1,
          });
        }
      }
    }

    debug?.mergeComplete({ mergeId: "double_pass_auto_merge", success: true });

    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged,
      });
    }

    // ---- Exact (hash-based) dedupe ---------------------------------------
    // The span is opened before the work so that it actually covers it.
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "dedupe.method": "exact_hashing",
      },
    });

    merged = dedupeArrays(merged);

    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length,
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged,
      });
    }

    // ---- LLM dedupe -------------------------------------------------------
    const dedupePrompt = buildDeduplicationPrompt(schema, merged);

    debug?.dedupeStart({
      dedupeId: "double_pass_auto_dedupe",
      itemCount: Object.keys(merged).length,
    });

    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "dedupe.method": "llm",
      },
    });

    const dedupeResponse = await runWithRetries<{ keys: string[] }>({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "double_pass_auto_dedupe",
      telemetry: telemetry ?? undefined,
      parentSpan: llmDedupeSpan,
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "pass 1 dedupe",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "pass 1 dedupe",
      strategy: this.name,
    });

    // The returned paths index into `merged` as it was sent to the model.
    // Removing them in response order would shift later indices after every
    // removal and delete the wrong items, so the highest index goes first.
    // Repeated paths are collapsed so one duplicate is removed only once.
    const indexOfPath = (path: string): number => {
      const parsed = Number(path.split(".")[1]);
      return Number.isNaN(parsed) ? -1 : parsed;
    };
    const removalPaths = [...new Set(dedupeResponse.data.keys)].sort(
      (a, b) => indexOfPath(b) - indexOfPath(a),
    );

    let deduped = merged;
    for (const path of removalPaths) {
      deduped = removeByPath(deduped, path);
    }

    debug?.dedupeComplete({
      dedupeId: "double_pass_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length,
    });

    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length,
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped,
      });
    }

    if (pass1Span && telemetry) {
      telemetry.endSpan(pass1Span, {
        status: "ok",
        output: deduped,
      });
    }

    let currentData = deduped as T;
    const usages = [...results.map((r) => r.usage), dedupeResponse.usage];

    // ---- Pass 2: sequential refinement ------------------------------------
    const pass2Span = telemetry?.startSpan({
      name: "struktur.pass_2",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 2,
        "pass.type": "sequential_refinement",
      },
    });

    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions,
      );

      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        // Same precedence as pass 1: options.strict wins over config.strict.
        strict,
        debug,
        callId: `double_pass_auto_2_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: pass2Span,
      });

      // Each response replaces the running data wholesale.
      currentData = result.data;
      usages.push(result.usage);

      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
    }

    if (pass2Span && telemetry) {
      telemetry.endSpan(pass2Span, {
        status: "ok",
        output: currentData,
      });
    }

    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: currentData,
      });
    }

    return { data: currentData, usage: mergeUsage(usages) };
  }
}
|
|
405
|
-
|
|
406
|
-
/**
 * Convenience factory: builds a `DoublePassAutoMergeStrategy` from `config`
 * without the caller having to use `new`.
 */
export const doublePassAutoMerge = <T>(
  config: DoublePassAutoMergeStrategyConfig,
): DoublePassAutoMergeStrategy<T> => new DoublePassAutoMergeStrategy<T>(config);
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { DoublePassStrategy } from "./DoublePassStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
/** Shape the strategy under test is asked to extract. */
type Output = { title: string };

// Smallest schema that still validates: one required string field.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    title: { type: "string" },
  },
  required: ["title"],
  additionalProperties: false,
};

// A single short text artifact.
const artifacts: Artifact[] = [
  {
    id: "a1",
    type: "text",
    raw: async () => Buffer.from(""),
    contents: [{ text: "abcdefgh" }],
  },
];

test("DoublePassStrategy runs second pass", async () => {
  let invocations = 0;

  // Stub runner: counts invocations and tags the title with the invocation
  // number so the test can tell which call produced the final data.
  const execute = (async () => {
    invocations += 1;
    const data = { title: `pass-${invocations}` };
    return {
      data,
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  }) as any;

  const strategy = new DoublePassStrategy<Output>({
    model: {},
    mergeModel: {},
    chunkSize: 10,
    execute,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };

  const outcome = await strategy.run(options);

  // Three runner calls in total; the last one supplies the returned data.
  expect(outcome.data.title).toBe("pass-3");
  expect(invocations).toBe(3);
});
|