@struktur/sdk 2.1.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4111 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/strategies.js +2435 -0
- package/dist/strategies.js.map +1 -0
- package/package.json +24 -12
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/providers.ts +0 -7
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/index.ts +0 -6
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/index.ts +0 -7
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
|
@@ -1,325 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
-
import { buildDeduplicationPrompt } from "../prompts/DeduplicationPrompt";
|
|
5
|
-
import {
|
|
6
|
-
extractWithPrompt,
|
|
7
|
-
getBatches,
|
|
8
|
-
mergeUsage,
|
|
9
|
-
serializeSchema,
|
|
10
|
-
} from "./utils";
|
|
11
|
-
import { SmartDataMerger } from "../merge/SmartDataMerger";
|
|
12
|
-
import {
|
|
13
|
-
findExactDuplicatesWithHashing,
|
|
14
|
-
deduplicateByIndices,
|
|
15
|
-
} from "../merge/Deduplicator";
|
|
16
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
17
|
-
|
|
18
|
-
/**
 * Configuration accepted by {@link SequentialAutoMergeStrategy}.
 */
export type SequentialAutoMergeStrategyConfig = {
  /** Model handle forwarded to every per-batch extraction call. */
  model: unknown;
  /** Per-batch token budget; handed to `getBatches` as `maxTokens`. */
  chunkSize: number;
  /** Optional per-batch image cap; handed to `getBatches` as `maxImages`. */
  maxImages?: number;
  /** Extra instructions passed through to `buildExtractorPrompt`. */
  outputInstructions?: string;
  /** Model for the LLM dedupe call; `model` is used when this is nullish. */
  dedupeModel?: unknown;
  /** Runner forwarded to `extractWithPrompt` for each batch. */
  execute?: typeof runWithRetries;
  /** Runner forwarded as `execute` to the dedupe `runWithRetries` call. */
  dedupeExecute?: typeof runWithRetries;
  /**
   * Strict flag. Per-batch extraction prefers `options.strict` and falls back
   * to this value; the dedupe call uses this value directly.
   */
  strict?: boolean;
};
|
|
28
|
-
|
|
29
|
-
/**
 * JSON schema for the LLM dedupe response: an object whose single, required
 * property `keys` is an array of strings. No other properties are allowed.
 */
const dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" },
    },
  },
  required: ["keys"],
  additionalProperties: false,
} as const;
|
|
37
|
-
|
|
38
|
-
/**
 * Returns a shallow copy of `data` in which every top-level array value has
 * been passed through `findExactDuplicatesWithHashing` and
 * `deduplicateByIndices`. Non-array values are copied over untouched and the
 * input object itself is not modified.
 */
const dedupeArrays = (data: Record<string, unknown>) => {
  const cleaned: Record<string, unknown> = { ...data };

  for (const field of Object.keys(cleaned)) {
    const current = cleaned[field];
    if (!Array.isArray(current)) {
      continue;
    }

    const duplicateIndices = findExactDuplicatesWithHashing(current);
    cleaned[field] = deduplicateByIndices(current, duplicateIndices);
  }

  return cleaned;
};
|
|
48
|
-
|
|
49
|
-
/**
 * Removes a single array element addressed by a `"<field>.<index>"` path.
 *
 * Only the first two dot-separated segments are read; any further segments
 * are ignored. The input object and its arrays are never mutated.
 *
 * @param data - Object whose top-level array should lose one element.
 * @param path - Path of the form `"<field>.<index>"`, where `<index>` is a
 *   non-negative integer.
 * @returns A shallow copy of `data` with the element removed, or `data`
 *   itself when the path is malformed, the field is not an array, or the
 *   index is out of range.
 */
const removeByPath = (data: Record<string, unknown>, path: string) => {
  const [root, indexStr] = path.split(".");
  const indexText = indexStr?.trim() ?? "";

  // Require plain decimal digits: `Number("")` is 0 and negative values make
  // `splice` count from the end, so both would delete the wrong element.
  if (!root || !/^\d+$/.test(indexText)) {
    return data;
  }

  const index = Number(indexText);
  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }

  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
|
|
65
|
-
|
|
66
|
-
/**
 * Extraction strategy that processes artifact batches one at a time, folds
 * each batch result into an accumulated object with `SmartDataMerger`, then
 * removes duplicates in two passes: exact-hash dedupe of every top-level
 * array, followed by an LLM call that returns `"<field>.<index>"` paths of
 * items to drop.
 */
export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
  public name = "sequential-auto-merge";
  private config: SequentialAutoMergeStrategyConfig;

  constructor(config: SequentialAutoMergeStrategyConfig) {
    this.config = config;
  }

  /**
   * Estimates the number of progress steps for the given artifacts.
   *
   * @param artifacts - Artifacts that would be passed to {@link run}.
   * @returns The number of batches produced by `getBatches` plus 3.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length + 3;
  }

  /**
   * Runs extraction batch by batch, merges the results and deduplicates them.
   *
   * @param options - Extraction options (artifacts, schema, optional events,
   *   debug logger, telemetry and strict flag).
   * @returns The deduplicated data cast to `T`, and the combined usage of all
   *   batch calls plus the dedupe call.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.sequential-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const merger = new SmartDataMerger(
      options.schema as Record<string, unknown>,
    );
    let merged = {} as Record<string, unknown>;
    const usages = [];
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    debug?.mergeStart({
      mergeId: "sequential_auto_merge",
      inputCount: batches.length,
      strategy: this.name,
    });

    // Create smart merge span
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": batches.length,
      },
    });

    for (const [index, batch] of batches.entries()) {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `sequential_auto_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: mergeSpan,
      });

      merged = merger.merge(merged, result.data as Record<string, unknown>);
      usages.push(result.usage);

      // Log merge operation per field. `merged` has already absorbed this
      // batch, so `leftArray` is the post-merge length of the field.
      for (const key of Object.keys(result.data as Record<string, unknown>)) {
        const leftArray = Array.isArray(merged[key])
          ? (merged[key] as unknown[]).length
          : undefined;
        const rightArray = Array.isArray(
          (result.data as Record<string, unknown>)[key],
        )
          ? ((result.data as Record<string, unknown>)[key] as unknown[]).length
          : undefined;

        debug?.smartMergeField({
          mergeId: "sequential_auto_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray,
        });

        // Record merge event in telemetry
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1,
          });
        }
      }

      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
    }

    debug?.mergeComplete({ mergeId: "sequential_auto_merge", success: true });

    // End merge span
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged,
      });
    }

    // Open the exact dedupe span before doing the work so that the span
    // actually covers the dedupe pass.
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing",
      },
    });

    merged = dedupeArrays(merged);

    // End exact dedupe span
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length,
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged,
      });
    }

    const dedupePrompt = buildDeduplicationPrompt(schema, merged);

    debug?.dedupeStart({
      dedupeId: "sequential_auto_dedupe",
      itemCount: Object.keys(merged).length,
    });

    // Create LLM dedupe span
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm",
      },
    });

    const dedupeResponse = await runWithRetries<{ keys: string[] }>({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "sequential_auto_dedupe",
      telemetry: telemetry ?? undefined,
      parentSpan: llmDedupeSpan,
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name,
    });

    // The dedupe prompt is built from `merged`, so every returned path is
    // taken to address a position in `merged`. Removing items in response
    // order would shift the positions that later paths refer to, and a
    // repeated path would remove two items. Apply each distinct path once,
    // from the highest index down, so earlier removals never move a later
    // target.
    const indexOfPath = (path: string): number => {
      const index = Number(path.split(".")[1]);
      return Number.isFinite(index) ? index : -1;
    };
    const removalPaths = [...new Set(dedupeResponse.data.keys)].sort(
      (a, b) => indexOfPath(b) - indexOfPath(a),
    );

    let deduped = merged;
    for (const path of removalPaths) {
      deduped = removeByPath(deduped, path);
    }

    debug?.dedupeComplete({
      dedupeId: "sequential_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length,
    });

    // End LLM dedupe span
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length,
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped,
      });
    }

    // End strategy span (guarded like the other spans instead of asserting
    // non-null).
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: deduped,
      });
    }

    return {
      data: deduped as T,
      usage: mergeUsage([...usages, dedupeResponse.usage]),
    };
  }
}
|
|
320
|
-
|
|
321
|
-
/**
 * Factory shorthand: builds a {@link SequentialAutoMergeStrategy} from the
 * given configuration.
 */
export const sequentialAutoMerge = <T>(
  config: SequentialAutoMergeStrategyConfig,
) => new SequentialAutoMergeStrategy<T>(config);
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { SequentialStrategy } from "./SequentialStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
// Shape the strategy is asked to extract in this test.
type Output = { title: string };

const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    title: { type: "string" },
  },
  required: ["title"],
  additionalProperties: false,
};

// Two identical text artifacts, "a1" and "a2".
const artifacts: Artifact[] = ["a1", "a2"].map(
  (id): Artifact => ({
    id,
    type: "text",
    raw: async () => Buffer.from(""),
    contents: [{ text: "abcdefgh" }],
  }),
);

test("SequentialStrategy processes batches in order", async () => {
  let calls = 0;

  // Stub runner: counts invocations and encodes the count in the title.
  const fakeExecute = async () => {
    calls += 1;
    return {
      data: { title: `step-${calls}` },
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  };

  const strategy = new SequentialStrategy<Output>({
    model: {},
    chunkSize: 2,
    execute: fakeExecute as any,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const outcome = await strategy.run(options);

  // The data returned by the last call is what the strategy reports.
  expect(outcome.data.title).toBe("step-2");
  expect(calls).toBe(2);
});
|
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildSequentialPrompt } from "../prompts/SequentialExtractorPrompt";
|
|
4
|
-
import {
|
|
5
|
-
extractWithPrompt,
|
|
6
|
-
getBatches,
|
|
7
|
-
mergeUsage,
|
|
8
|
-
serializeSchema,
|
|
9
|
-
} from "./utils";
|
|
10
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
11
|
-
|
|
12
|
-
/**
 * Configuration accepted by {@link SequentialStrategy}.
 */
export type SequentialStrategyConfig = {
  /** Model handle forwarded to every per-batch extraction call. */
  model: unknown;
  /** Per-batch token budget; handed to `getBatches` as `maxTokens`. */
  chunkSize: number;
  /** Optional per-batch image cap; handed to `getBatches` as `maxImages`. */
  maxImages?: number;
  /** Extra instructions passed through to `buildSequentialPrompt`. */
  outputInstructions?: string;
  /** Runner forwarded to `extractWithPrompt` for each batch. */
  execute?: typeof runWithRetries;
  /** Strict flag used when `options.strict` is not set. */
  strict?: boolean;
};
|
|
20
|
-
|
|
21
|
-
/**
 * Extraction strategy that walks the artifact batches in order, feeding the
 * JSON of the previous batch's result into the prompt for the next batch.
 * The data returned for the last batch is the final result.
 */
export class SequentialStrategy<T> implements ExtractionStrategy<T> {
  public name = "sequential";
  private config: SequentialStrategyConfig;

  constructor(config: SequentialStrategyConfig) {
    this.config = config;
  }

  /**
   * Estimates the number of progress steps for the given artifacts.
   *
   * @param artifacts - Artifacts that would be passed to {@link run}.
   * @returns The number of batches produced by `getBatches` plus 2.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length + 2;
  }

  /**
   * Runs the extraction batch by batch.
   *
   * @param options - Extraction options (artifacts, schema, optional events,
   *   debug logger, telemetry and strict flag).
   * @returns The data from the final batch and the combined usage of all
   *   batch calls.
   * @throws Error when no batch yielded data (for example, no batches at all).
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.sequential",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    let currentData: T | undefined;
    const usages = [];
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    // Emit start event
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
      strategy: this.name,
    });

    for (const [index, batch] of batches.entries()) {
      // The first batch starts from an empty object; later batches see the
      // serialized result of the batch before them.
      const previousData = currentData ? JSON.stringify(currentData) : "{}";
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        previousData,
        this.config.outputInstructions,
      );

      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `sequential_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: strategySpan,
      });

      currentData = result.data;
      usages.push(result.usage);

      step += 1;
      // Only emit progress if there are more batches
      if (index < batches.length - 1) {
        await options.events?.onStep?.({
          step,
          total: totalSteps,
          label: `batch ${index + 2}/${batches.length}`,
        });
        debug?.step({
          step,
          total: totalSteps,
          label: `batch ${index + 2}/${batches.length}`,
          strategy: this.name,
        });
      }
    }

    if (!currentData) {
      throw new Error("No data extracted from sequential strategy");
    }

    // End strategy span (explicit guard instead of a non-null assertion).
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: currentData,
      });
    }

    return { data: currentData, usage: mergeUsage(usages) };
  }
}
|
|
139
|
-
|
|
140
|
-
/**
 * Factory shorthand: builds a {@link SequentialStrategy} from the given
 * configuration.
 */
export const sequential = <T>(config: SequentialStrategyConfig) =>
  new SequentialStrategy<T>(config);
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { SimpleStrategy } from "./SimpleStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
// Shape the strategy is asked to extract in this test.
type Output = { title: string };

const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    title: { type: "string" },
  },
  required: ["title"],
  additionalProperties: false,
};

// A single text artifact is enough for the one-shot strategy.
const helloArtifact: Artifact = {
  id: "a1",
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "hello" }],
};
const artifacts: Artifact[] = [helloArtifact];

test("SimpleStrategy runs once", async () => {
  let calls = 0;

  // Stub runner: counts invocations and always returns the same payload.
  const fakeExecute = async () => {
    calls += 1;
    return {
      data: { title: "ok" },
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  };

  const strategy = new SimpleStrategy<Output>({
    model: {},
    execute: fakeExecute as any,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const outcome = await strategy.run(options);

  expect(outcome.data.title).toBe("ok");
  expect(calls).toBe(1);
});
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
-
import { extractWithPrompt, serializeSchema } from "./utils";
|
|
5
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
6
|
-
|
|
7
|
-
/**
 * Configuration accepted by {@link SimpleStrategy}.
 */
export type SimpleStrategyConfig = {
  /** Model handle forwarded to the single extraction call. */
  model: unknown;
  /** Extra instructions passed through to `buildExtractorPrompt`. */
  outputInstructions?: string;
  /** Runner forwarded to `extractWithPrompt`. */
  execute?: typeof runWithRetries;
  /** Strict flag used when `options.strict` is not set. */
  strict?: boolean;
};
|
|
13
|
-
|
|
14
|
-
/**
 * Extraction strategy that sends all artifacts to the model in a single
 * extraction call, with no batching or merging.
 */
export class SimpleStrategy<T> implements ExtractionStrategy<T> {
  public name = "simple";
  private config: SimpleStrategyConfig;

  constructor(config: SimpleStrategyConfig) {
    this.config = config;
  }

  /**
   * @returns The fixed step estimate for this strategy, 3.
   */
  getEstimatedSteps(): number {
    return 3;
  }

  /**
   * Builds one extractor prompt over all artifacts and runs it once.
   *
   * @param options - Extraction options (artifacts, schema, optional events,
   *   debug logger, telemetry and strict flag).
   * @returns The extracted data and usage from the single call.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.simple",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
      },
    });

    const schema = serializeSchema(options.schema);
    const { system, user } = buildExtractorPrompt(
      options.artifacts,
      schema,
      this.config.outputInstructions,
    );

    // Emit start event before extraction begins
    await options.events?.onStep?.({
      step: 1,
      total: this.getEstimatedSteps(),
      label: "extract",
    });
    debug?.step({
      step: 1,
      total: this.getEstimatedSteps(),
      label: "extract",
      strategy: this.name,
    });

    const result = await extractWithPrompt<T>({
      model: this.config.model,
      schema: options.schema,
      system,
      user,
      artifacts: options.artifacts,
      events: options.events,
      execute: this.config.execute as never,
      strict: options.strict ?? this.config.strict,
      debug,
      callId: "simple_extract",
      telemetry,
      parentSpan: strategySpan,
    });

    // Completion is reported to the debug logger only; no `onStep` event is
    // emitted for it here.
    debug?.step({
      step: 2,
      total: this.getEstimatedSteps(),
      label: "complete",
      strategy: this.name,
    });

    // End strategy span (explicit guard instead of a non-null assertion).
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: result.data,
      });
    }

    return { data: result.data, usage: result.usage };
  }
}
|
|
91
|
-
|
|
92
|
-
/**
 * Factory shorthand: builds a {@link SimpleStrategy} from the given
 * configuration.
 */
export const simple = <T>(config: SimpleStrategyConfig) =>
  new SimpleStrategy<T>(config);
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { runConcurrently } from "./concurrency";
|
|
3
|
-
|
|
4
|
-
test("runConcurrently runs tasks in batches", async () => {
  const started: number[] = [];
  const pause = () => new Promise((resolve) => setTimeout(resolve, 5));

  // Each task records that it started, waits briefly, then yields its value.
  const makeTask = (value: number) => async () => {
    started.push(value);
    await pause();
    return value;
  };
  const tasks = [1, 2, 3, 4, 5].map((value) => makeTask(value));

  const results = await runConcurrently(tasks, 2);

  // Both the results and the start order follow the task order.
  const expectedOrder = [1, 2, 3, 4, 5];
  expect(results).toEqual(expectedOrder);
  expect(started).toEqual(expectedOrder);
});
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
/**
 * Runs async task factories in consecutive batches of at most `concurrency`
 * tasks. All tasks of a batch are started together and awaited with
 * `Promise.all` before the next batch starts.
 *
 * @param tasks - Factories that start a task when called.
 * @param concurrency - Maximum number of tasks per batch. Fractional values
 *   are floored; `NaN` and values below 1 are treated as 1 so that the loop
 *   always makes progress. `Infinity` runs every task in one batch.
 * @returns The task results, in the same order as `tasks`.
 * @throws Rejects with the first rejection of a batch; later batches are not
 *   started.
 */
export const runConcurrently = async <T>(
  tasks: Array<() => Promise<T>>,
  concurrency: number
): Promise<T[]> => {
  // A step of 0 or less would never advance the loop, and NaN would end it
  // after an empty slice, silently dropping every task.
  const batchSize = Number.isNaN(concurrency)
    ? 1
    : Math.max(1, Math.floor(concurrency));
  const results: T[] = [];

  for (let i = 0; i < tasks.length; i += batchSize) {
    const chunk = tasks.slice(i, i + batchSize).map((task) => task());
    const chunkResults = await Promise.all(chunk);
    results.push(...chunkResults);
  }

  return results;
};
|