@struktur/sdk 2.1.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/artifacts/fileToArtifact.d.ts +8 -0
- package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
- package/dist/artifacts/input.d.ts +60 -0
- package/dist/artifacts/input.d.ts.map +1 -0
- package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
- package/dist/artifacts/providers.d.ts.map +1 -0
- package/dist/artifacts/urlToArtifact.d.ts +3 -0
- package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
- package/dist/auth/config.d.ts +34 -0
- package/dist/auth/config.d.ts.map +1 -0
- package/dist/auth/tokens.d.ts +18 -0
- package/dist/auth/tokens.d.ts.map +1 -0
- package/dist/chunking/ArtifactBatcher.d.ts +11 -0
- package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
- package/dist/chunking/ArtifactSplitter.d.ts +10 -0
- package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
- package/dist/debug/logger.d.ts +169 -0
- package/dist/debug/logger.d.ts.map +1 -0
- package/dist/extract.d.ts +3 -0
- package/dist/extract.d.ts.map +1 -0
- package/dist/fields.d.ts +75 -0
- package/dist/fields.d.ts.map +1 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5603 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/LLMClient.d.ts +40 -0
- package/dist/llm/LLMClient.d.ts.map +1 -0
- package/dist/llm/RetryingRunner.d.ts +37 -0
- package/dist/llm/RetryingRunner.d.ts.map +1 -0
- package/dist/llm/message.d.ts +12 -0
- package/dist/llm/message.d.ts.map +1 -0
- package/dist/llm/models.d.ts +13 -0
- package/dist/llm/models.d.ts.map +1 -0
- package/dist/llm/resolveModel.d.ts +3 -0
- package/dist/llm/resolveModel.d.ts.map +1 -0
- package/dist/merge/Deduplicator.d.ts +4 -0
- package/dist/merge/Deduplicator.d.ts.map +1 -0
- package/dist/merge/SmartDataMerger.d.ts +7 -0
- package/dist/merge/SmartDataMerger.d.ts.map +1 -0
- package/dist/parsers/collect.d.ts +7 -0
- package/dist/parsers/collect.d.ts.map +1 -0
- package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/mime.d.ts +12 -0
- package/dist/parsers/mime.d.ts.map +1 -0
- package/dist/parsers/npm.d.ts +16 -0
- package/dist/parsers/npm.d.ts.map +1 -0
- package/dist/parsers/pdf.d.ts +36 -0
- package/dist/parsers/pdf.d.ts.map +1 -0
- package/dist/parsers/runner.d.ts +4 -0
- package/dist/parsers/runner.d.ts.map +1 -0
- package/dist/parsers/types.d.ts +27 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers.d.ts +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
- package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
- package/dist/prompts/ExtractorPrompt.d.ts +6 -0
- package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/formatArtifacts.d.ts +3 -0
- package/dist/prompts/formatArtifacts.d.ts.map +1 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/DoublePassStrategy.d.ts +22 -0
- package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelStrategy.d.ts +22 -0
- package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialStrategy.d.ts +20 -0
- package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
- package/dist/strategies/SimpleStrategy.d.ts +18 -0
- package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
- package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentTools.d.ts +55 -0
- package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
- package/dist/strategies/agent/index.d.ts +4 -0
- package/dist/strategies/agent/index.d.ts.map +1 -0
- package/dist/strategies/concurrency.d.ts +2 -0
- package/dist/strategies/concurrency.d.ts.map +1 -0
- package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
- package/dist/strategies/index.d.ts.map +1 -0
- package/dist/strategies/utils.d.ts +39 -0
- package/dist/strategies/utils.d.ts.map +1 -0
- package/dist/strategies.d.ts +1 -0
- package/dist/strategies.js +3930 -0
- package/dist/strategies.js.map +1 -0
- package/dist/tokenization.d.ts +11 -0
- package/dist/tokenization.d.ts.map +1 -0
- package/dist/types.d.ts +178 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/validation/validator.d.ts +20 -0
- package/dist/validation/validator.d.ts.map +1 -0
- package/package.json +30 -14
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
|
@@ -1,345 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
-
import { buildDeduplicationPrompt } from "../prompts/DeduplicationPrompt";
|
|
5
|
-
import {
|
|
6
|
-
extractWithPrompt,
|
|
7
|
-
getBatches,
|
|
8
|
-
mergeUsage,
|
|
9
|
-
serializeSchema,
|
|
10
|
-
} from "./utils";
|
|
11
|
-
import { runConcurrently } from "./concurrency";
|
|
12
|
-
import { SmartDataMerger } from "../merge/SmartDataMerger";
|
|
13
|
-
import {
|
|
14
|
-
findExactDuplicatesWithHashing,
|
|
15
|
-
deduplicateByIndices,
|
|
16
|
-
} from "../merge/Deduplicator";
|
|
17
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
18
|
-
|
|
19
|
-
/**
 * Settings consumed by the parallel auto-merge strategy.
 */
export type ParallelAutoMergeStrategyConfig = {
  /** Model handed to every per-batch extraction call. */
  model: unknown;
  /** Passed to the batcher as its `maxTokens` limit. */
  chunkSize: number;
  /** Concurrency limit for the batch tasks; the batch count is used when omitted. */
  concurrency?: number;
  /** Passed to the batcher as its `maxImages` limit. */
  maxImages?: number;
  /** Extra instructions forwarded to the extractor prompt builder. */
  outputInstructions?: string;
  /** Model for the deduplication call; `model` is used when this is nullish. */
  dedupeModel?: unknown;
  /** Replacement executor forwarded to the per-batch extraction calls. */
  execute?: typeof runWithRetries;
  /** Replacement executor forwarded to the deduplication call. */
  dedupeExecute?: typeof runWithRetries;
  /** Strict flag; for extraction it only applies when the run options do not set one. */
  strict?: boolean;
};
|
|
30
|
-
|
|
31
|
-
/**
 * JSON schema for the deduplication response: an object whose only property,
 * `keys`, is a required array of strings.
 */
const dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" },
    },
  },
  required: ["keys"],
  additionalProperties: false,
} as const;
|
|
39
|
-
|
|
40
|
-
/**
 * Builds a shallow copy of `data` in which each top-level array value is
 * replaced by the result of removing the indices reported by the exact
 * duplicate finder. Values that are not arrays are copied over untouched.
 *
 * @param data - Record whose top-level arrays should be deduplicated.
 * @returns A new record; `data` itself is not reassigned.
 */
const dedupeArrays = (data: Record<string, unknown>) => {
  const copy: Record<string, unknown> = { ...data };
  Object.keys(copy).forEach((field) => {
    const current = copy[field];
    if (!Array.isArray(current)) {
      return;
    }
    copy[field] = deduplicateByIndices(
      current,
      findExactDuplicatesWithHashing(current),
    );
  });
  return copy;
};
|
|
50
|
-
|
|
51
|
-
/**
 * Removes one array element addressed by a `"<field>.<index>"` path.
 *
 * Only the first two dot-separated segments are read: the first names a
 * top-level property of `data`, the second is the element index. The input is
 * never mutated; when nothing can be removed the original `data` object is
 * returned as-is.
 *
 * @param data - Record holding the array to edit.
 * @param path - Path of the form `"<field>.<index>"`.
 * @returns A shallow copy of `data` with the element removed, or `data` itself
 *   when the path is malformed, the field is not an array, or the index is out
 *   of range.
 */
const removeByPath = (data: Record<string, unknown>, path: string) => {
  const [root, indexStr] = path.split(".");
  const indexText = indexStr?.trim() ?? "";

  // `Number("")` is 0 and a negative index makes `splice` count from the end,
  // so a bare numeric conversion would delete an element the path never named.
  // Accept plain decimal digits only.
  if (!root || !/^\d+$/.test(indexText)) {
    return data;
  }
  const index = Number(indexText);

  const value = data[root];
  if (!Array.isArray(value) || index >= value.length) {
    return data;
  }

  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
|
|
67
|
-
|
|
68
|
-
/**
 * Extraction strategy that runs one extraction per artifact batch through
 * `runConcurrently`, folds the per-batch results together with
 * `SmartDataMerger`, drops exact duplicates found by hashing, and finally asks
 * a model which of the remaining array items should be removed.
 */
export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
  public name = "parallel-auto-merge";
  private config: ParallelAutoMergeStrategyConfig;

  constructor(config: ParallelAutoMergeStrategyConfig) {
    this.config = config;
  }

  /**
   * Estimates how many progress steps a run over `artifacts` reports.
   *
   * @param artifacts - Artifacts that would be batched for a run.
   * @returns The number of batches plus a fixed allowance of three.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length + 3;
  }

  /**
   * Applies a list of `"<field>.<index>"` removals to `data`.
   *
   * Every index refers to the array as it was when the list was produced, so
   * removals are applied from the highest index down; removing in reported
   * order would shift the later indices onto the wrong items. A target that
   * is named more than once is removed only once.
   *
   * @param data - Merged record to remove items from.
   * @param paths - Removal paths as returned by the deduplication call.
   * @returns The record with the addressed items removed.
   */
  private removeDuplicatePaths(
    data: Record<string, unknown>,
    paths: readonly string[],
  ): Record<string, unknown> {
    const seen = new Set<string>();
    const targets: Array<{ path: string; index: number }> = [];

    for (const path of paths) {
      const [root, indexStr] = path.split(".");
      const index = Number(indexStr);
      // Same rejection rule as `removeByPath`; such paths are no-ops there.
      if (!root || Number.isNaN(index)) {
        continue;
      }
      const id = `${root}.${index}`;
      if (seen.has(id)) {
        continue;
      }
      seen.add(id);
      targets.push({ path, index });
    }

    targets.sort((a, b) => b.index - a.index);

    let current = data;
    for (const target of targets) {
      current = removeByPath(current, target.path);
    }
    return current;
  }

  /**
   * Runs the strategy: batch, extract concurrently, merge, deduplicate.
   *
   * @param options - Run options (artifacts, schema, events, debug, telemetry).
   * @returns The deduplicated data plus the combined usage of all calls.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    // NOTE(review): `step` is incremented before the first report, so the
    // first reported step is 2 — confirm this is intended.
    let step = 1;

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `parallel_auto_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: strategySpan,
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
        strategy: this.name,
      });
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    const merger = new SmartDataMerger(
      options.schema as Record<string, unknown>,
    );
    let merged = {} as Record<string, unknown>;

    debug?.mergeStart({
      mergeId: "parallel_auto_smart_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Create smart merge span
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length,
      },
    });

    for (const result of results) {
      const incoming = result.data as Record<string, unknown>;
      merged = merger.merge(merged, incoming);

      // Log merge operation per field. `leftCount` is read after the merge,
      // so it is the accumulated length including this batch.
      for (const key of Object.keys(incoming)) {
        const mergedValue = merged[key];
        const incomingValue = incoming[key];
        const leftArray = Array.isArray(mergedValue)
          ? mergedValue.length
          : undefined;
        const rightArray = Array.isArray(incomingValue)
          ? incomingValue.length
          : undefined;

        debug?.smartMergeField({
          mergeId: "parallel_auto_smart_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray,
        });

        // Record merge event in telemetry
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1,
          });
        }
      }
    }

    debug?.mergeComplete({
      mergeId: "parallel_auto_smart_merge",
      success: true,
    });

    // End merge span
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged,
      });
    }

    // Open the exact dedupe span before doing the work it describes.
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing",
      },
    });

    merged = dedupeArrays(merged);

    // End exact dedupe span
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length,
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged,
      });
    }

    const dedupePrompt = buildDeduplicationPrompt(schema, merged);

    debug?.dedupeStart({
      dedupeId: "parallel_auto_dedupe",
      itemCount: Object.keys(merged).length,
    });

    // Create LLM dedupe span
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm",
      },
    });

    // NOTE(review): unlike the batch calls, this call ignores `options.strict`
    // and uses only the configured flag — confirm that is deliberate.
    const dedupeResponse = await runWithRetries<{ keys: string[] }>({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "parallel_auto_dedupe",
      telemetry: telemetry ?? undefined,
      parentSpan: llmDedupeSpan,
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name,
    });

    const deduped = this.removeDuplicatePaths(merged, dedupeResponse.data.keys);

    debug?.dedupeComplete({
      dedupeId: "parallel_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length,
    });

    // End LLM dedupe span
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length,
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped,
      });
    }

    // End strategy span
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: deduped,
      });
    }

    return {
      data: deduped as T,
      usage: mergeUsage([...results.map((r) => r.usage), dedupeResponse.usage]),
    };
  }
}
|
|
335
|
-
|
|
336
|
-
/**
 * Convenience factory: constructs a `ParallelAutoMergeStrategy` from `config`.
 *
 * @param config - Strategy configuration, passed to the constructor unchanged.
 * @returns A new strategy instance.
 */
export const parallelAutoMerge = <T>(
  config: ParallelAutoMergeStrategyConfig,
) => new ParallelAutoMergeStrategy<T>(config);
|
|
341
|
-
|
|
342
|
-
/** Exposes the module-private helpers under their own names. */
export const __testing__ = {
  dedupeArrays: dedupeArrays,
  removeByPath: removeByPath,
};
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { ParallelStrategy } from "./ParallelStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
/** Shape the strategy under test is asked to produce. */
type Output = { title: string };

const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: { title: { type: "string" } },
  required: ["title"],
  additionalProperties: false,
};

/** Builds a text artifact with the fixed eight-character body used by this test. */
const makeArtifact = (id: string): Artifact => ({
  id,
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "abcdefgh" }],
});

const artifacts: Artifact[] = [makeArtifact("a1"), makeArtifact("a2")];

test("ParallelStrategy merges batch results", async () => {
  let calls = 0;

  // Stub executor: answers "merged" when the user prompt contains the
  // <json-objects> marker and "chunk" otherwise, counting every call.
  const execute = (async (request: any) => {
    calls += 1;
    const isMergeCall =
      typeof request.user === "string" &&
      request.user.includes("<json-objects>");
    return {
      data: { title: isMergeCall ? "merged" : "chunk" },
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  }) as any;

  const strategy = new ParallelStrategy<Output>({
    model: {},
    mergeModel: {},
    chunkSize: 2,
    execute,
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const result = await strategy.run(options);

  expect(result.data.title).toBe("merged");
  expect(calls).toBe(3);
});
|
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
import type { ExtractionResult, ExtractionStrategy } from "../types";
|
|
2
|
-
import type { ExtractionOptions } from "../types";
|
|
3
|
-
import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
|
|
4
|
-
import { buildParallelMergerPrompt } from "../prompts/ParallelMergerPrompt";
|
|
5
|
-
import {
|
|
6
|
-
extractWithPrompt,
|
|
7
|
-
getBatches,
|
|
8
|
-
mergeUsage,
|
|
9
|
-
serializeSchema,
|
|
10
|
-
} from "./utils";
|
|
11
|
-
import { runConcurrently } from "./concurrency";
|
|
12
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
13
|
-
|
|
14
|
-
/**
 * Settings consumed by the parallel strategy.
 */
export type ParallelStrategyConfig = {
  /** Model handed to every per-batch extraction call. */
  model: unknown;
  /** Model handed to the final merge call. */
  mergeModel: unknown;
  /** Passed to the batcher as its `maxTokens` limit. */
  chunkSize: number;
  /** Concurrency limit for the batch tasks; the batch count is used when omitted. */
  concurrency?: number;
  /** Passed to the batcher as its `maxImages` limit. */
  maxImages?: number;
  /** Extra instructions forwarded to the extractor prompt builder. */
  outputInstructions?: string;
  /** Replacement executor forwarded to both the batch calls and the merge call. */
  execute?: typeof runWithRetries;
  /** Strict flag; for batch calls it only applies when the run options do not set one. */
  strict?: boolean;
};
|
|
24
|
-
|
|
25
|
-
/**
 * Extraction strategy that runs one extraction per artifact batch through
 * `runConcurrently` and then issues a single merge call that combines the
 * per-batch results.
 */
export class ParallelStrategy<T> implements ExtractionStrategy<T> {
  public name = "parallel";
  private config: ParallelStrategyConfig;

  constructor(config: ParallelStrategyConfig) {
    this.config = config;
  }

  /**
   * Estimates how many progress steps a run over `artifacts` reports.
   *
   * @param artifacts - Artifacts that would be batched for a run.
   * @returns The number of batches plus a fixed allowance of three.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const { chunkSize, maxImages } = this.config;
    const planned = getBatches(artifacts, { maxTokens: chunkSize, maxImages });
    return planned.length + 3;
  }

  /**
   * Runs the strategy: batch, extract concurrently, merge with a model call.
   *
   * @param options - Run options (artifacts, schema, events, debug, telemetry).
   * @returns The merged data plus the combined usage of all calls.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const { debug, telemetry } = options;
    const settings = this.config;

    // Span covering the whole strategy run.
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": settings.chunkSize,
        "strategy.concurrency": settings.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      { maxTokens: settings.chunkSize, maxImages: settings.maxImages },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    // Announce the first step before any batch has started.
    const openingLabel =
      batches.length > 1 ? `batch 1/${batches.length}` : "extract";
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: openingLabel,
    });
    debug?.step({
      step,
      total: totalSteps,
      label: openingLabel,
      strategy: this.name,
    });

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        settings.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: settings.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: settings.execute as never,
        strict: options.strict ?? settings.strict,
        debug,
        callId: `parallel_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: strategySpan,
      });

      // The batch with the highest position reports nothing; every other
      // batch announces the position after its own once it completes.
      const completedIndex = index + 1;
      if (completedIndex >= batches.length) {
        return result;
      }

      step += 1;
      const nextLabel = `batch ${completedIndex + 1}/${batches.length}`;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: nextLabel,
      });
      debug?.step({
        step,
        total: totalSteps,
        label: nextLabel,
        strategy: this.name,
      });
      return result;
    });

    const results = await runConcurrently(
      tasks,
      settings.concurrency ?? batches.length,
    );

    debug?.mergeStart({
      mergeId: "parallel_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Span covering the merge call.
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length,
      },
    });

    const batchOutputs = results.map((r) => r.data);
    const mergePrompt = buildParallelMergerPrompt(schema, batchOutputs);

    // NOTE(review): unlike the batch calls, the merge call ignores
    // `options.strict` and uses only the configured flag — confirm intent.
    const merged = await extractWithPrompt<T>({
      model: settings.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: settings.execute as never,
      strict: settings.strict,
      debug,
      callId: "parallel_merge",
      telemetry: telemetry ?? undefined,
      parentSpan: mergeSpan,
    });

    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "merge",
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "merge",
      strategy: this.name,
    });
    debug?.mergeComplete({ mergeId: "parallel_merge", success: true });

    // Close the merge span.
    if (mergeSpan && telemetry) {
      telemetry.recordEvent(mergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1,
      });
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged.data,
      });
    }

    // Close the strategy span.
    telemetry?.endSpan(strategySpan!, {
      status: "ok",
      output: merged.data,
    });

    const batchUsages = results.map((r) => r.usage);
    return {
      data: merged.data,
      usage: mergeUsage([...batchUsages, merged.usage]),
    };
  }
}
|
|
205
|
-
|
|
206
|
-
/**
 * Convenience factory: constructs a `ParallelStrategy` from `config`.
 *
 * @param config - Strategy configuration, passed to the constructor unchanged.
 * @returns A new strategy instance.
 */
export const parallel = <T>(config: ParallelStrategyConfig) =>
  new ParallelStrategy<T>(config);
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { JSONSchemaType } from "ajv";
|
|
3
|
-
import { SequentialAutoMergeStrategy } from "./SequentialAutoMergeStrategy";
|
|
4
|
-
import type { Artifact, ExtractionOptions } from "../types";
|
|
5
|
-
|
|
6
|
-
/** Shape the strategy under test is asked to produce. */
type Output = { items: Array<{ id: number }> };

const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    items: {
      type: "array",
      items: {
        type: "object",
        properties: { id: { type: "number" } },
        required: ["id"],
        additionalProperties: false,
      },
    },
  },
  required: ["items"],
  additionalProperties: false,
};

/** Builds a text artifact with the fixed eight-character body used by this test. */
const makeArtifact = (id: string): Artifact => ({
  id,
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "abcdefgh" }],
});

const artifacts: Artifact[] = [makeArtifact("a1"), makeArtifact("a2")];

/**
 * Creates a stub executor that resolves with freshly built data and a fixed
 * usage record on every call.
 */
const stubExecutor = (makeData: () => unknown) =>
  (async () => {
    return {
      data: makeData(),
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
    };
  }) as any;

test("SequentialAutoMergeStrategy merges and dedupes", async () => {
  const strategy = new SequentialAutoMergeStrategy<Output>({
    model: {},
    chunkSize: 2,
    // Every extraction call yields the same item twice.
    execute: stubExecutor(() => ({ items: [{ id: 1 }, { id: 1 }] })),
    // The dedupe call reports nothing to remove.
    dedupeExecute: stubExecutor(() => ({ keys: [] })),
  });

  const options: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const result = await strategy.run(options);

  expect(result.data.items.length).toBe(1);
});
|