@struktur/sdk 2.1.2 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. package/dist/artifacts/fileToArtifact.d.ts +8 -0
  2. package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
  3. package/dist/artifacts/input.d.ts +60 -0
  4. package/dist/artifacts/input.d.ts.map +1 -0
  5. package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
  6. package/dist/artifacts/providers.d.ts.map +1 -0
  7. package/dist/artifacts/urlToArtifact.d.ts +3 -0
  8. package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
  9. package/dist/auth/config.d.ts +34 -0
  10. package/dist/auth/config.d.ts.map +1 -0
  11. package/dist/auth/tokens.d.ts +18 -0
  12. package/dist/auth/tokens.d.ts.map +1 -0
  13. package/dist/chunking/ArtifactBatcher.d.ts +11 -0
  14. package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
  15. package/dist/chunking/ArtifactSplitter.d.ts +10 -0
  16. package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
  17. package/dist/debug/logger.d.ts +169 -0
  18. package/dist/debug/logger.d.ts.map +1 -0
  19. package/dist/extract.d.ts +3 -0
  20. package/dist/extract.d.ts.map +1 -0
  21. package/dist/fields.d.ts +75 -0
  22. package/dist/fields.d.ts.map +1 -0
  23. package/dist/index.d.ts +24 -0
  24. package/dist/index.d.ts.map +1 -0
  25. package/dist/index.js +5603 -0
  26. package/dist/index.js.map +1 -0
  27. package/dist/llm/LLMClient.d.ts +40 -0
  28. package/dist/llm/LLMClient.d.ts.map +1 -0
  29. package/dist/llm/RetryingRunner.d.ts +37 -0
  30. package/dist/llm/RetryingRunner.d.ts.map +1 -0
  31. package/dist/llm/message.d.ts +12 -0
  32. package/dist/llm/message.d.ts.map +1 -0
  33. package/dist/llm/models.d.ts +13 -0
  34. package/dist/llm/models.d.ts.map +1 -0
  35. package/dist/llm/resolveModel.d.ts +3 -0
  36. package/dist/llm/resolveModel.d.ts.map +1 -0
  37. package/dist/merge/Deduplicator.d.ts +4 -0
  38. package/dist/merge/Deduplicator.d.ts.map +1 -0
  39. package/dist/merge/SmartDataMerger.d.ts +7 -0
  40. package/dist/merge/SmartDataMerger.d.ts.map +1 -0
  41. package/dist/parsers/collect.d.ts +7 -0
  42. package/dist/parsers/collect.d.ts.map +1 -0
  43. package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
  44. package/dist/parsers/index.d.ts.map +1 -0
  45. package/dist/parsers/mime.d.ts +12 -0
  46. package/dist/parsers/mime.d.ts.map +1 -0
  47. package/dist/parsers/npm.d.ts +16 -0
  48. package/dist/parsers/npm.d.ts.map +1 -0
  49. package/dist/parsers/pdf.d.ts +36 -0
  50. package/dist/parsers/pdf.d.ts.map +1 -0
  51. package/dist/parsers/runner.d.ts +4 -0
  52. package/dist/parsers/runner.d.ts.map +1 -0
  53. package/dist/parsers/types.d.ts +27 -0
  54. package/dist/parsers/types.d.ts.map +1 -0
  55. package/dist/parsers.d.ts +1 -0
  56. package/dist/parsers.js +492 -0
  57. package/dist/parsers.js.map +1 -0
  58. package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
  59. package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
  60. package/dist/prompts/ExtractorPrompt.d.ts +6 -0
  61. package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
  62. package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
  63. package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
  64. package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
  65. package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
  66. package/dist/prompts/formatArtifacts.d.ts +3 -0
  67. package/dist/prompts/formatArtifacts.d.ts.map +1 -0
  68. package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
  69. package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
  70. package/dist/strategies/DoublePassStrategy.d.ts +22 -0
  71. package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
  72. package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
  73. package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
  74. package/dist/strategies/ParallelStrategy.d.ts +22 -0
  75. package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
  76. package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
  77. package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
  78. package/dist/strategies/SequentialStrategy.d.ts +20 -0
  79. package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
  80. package/dist/strategies/SimpleStrategy.d.ts +18 -0
  81. package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
  82. package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
  83. package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
  84. package/dist/strategies/agent/AgentTools.d.ts +55 -0
  85. package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
  86. package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
  87. package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
  88. package/dist/strategies/agent/index.d.ts +4 -0
  89. package/dist/strategies/agent/index.d.ts.map +1 -0
  90. package/dist/strategies/concurrency.d.ts +2 -0
  91. package/dist/strategies/concurrency.d.ts.map +1 -0
  92. package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
  93. package/dist/strategies/index.d.ts.map +1 -0
  94. package/dist/strategies/utils.d.ts +39 -0
  95. package/dist/strategies/utils.d.ts.map +1 -0
  96. package/dist/strategies.d.ts +1 -0
  97. package/dist/strategies.js +3930 -0
  98. package/dist/strategies.js.map +1 -0
  99. package/dist/tokenization.d.ts +11 -0
  100. package/dist/tokenization.d.ts.map +1 -0
  101. package/dist/types.d.ts +178 -0
  102. package/dist/types.d.ts.map +1 -0
  103. package/dist/validation/validator.d.ts +20 -0
  104. package/dist/validation/validator.d.ts.map +1 -0
  105. package/package.json +30 -14
  106. package/src/agent-cli-integration.test.ts +0 -47
  107. package/src/agent-export.test.ts +0 -17
  108. package/src/agent-tool-labels.test.ts +0 -50
  109. package/src/artifacts/AGENTS.md +0 -16
  110. package/src/artifacts/fileToArtifact.test.ts +0 -37
  111. package/src/artifacts/fileToArtifact.ts +0 -44
  112. package/src/artifacts/input.test.ts +0 -243
  113. package/src/artifacts/input.ts +0 -360
  114. package/src/artifacts/providers.test.ts +0 -19
  115. package/src/artifacts/urlToArtifact.test.ts +0 -23
  116. package/src/artifacts/urlToArtifact.ts +0 -19
  117. package/src/auth/AGENTS.md +0 -11
  118. package/src/auth/config.test.ts +0 -132
  119. package/src/auth/config.ts +0 -186
  120. package/src/auth/tokens.test.ts +0 -58
  121. package/src/auth/tokens.ts +0 -229
  122. package/src/chunking/AGENTS.md +0 -11
  123. package/src/chunking/ArtifactBatcher.test.ts +0 -22
  124. package/src/chunking/ArtifactBatcher.ts +0 -110
  125. package/src/chunking/ArtifactSplitter.test.ts +0 -38
  126. package/src/chunking/ArtifactSplitter.ts +0 -151
  127. package/src/debug/AGENTS.md +0 -79
  128. package/src/debug/logger.test.ts +0 -244
  129. package/src/debug/logger.ts +0 -211
  130. package/src/extract.test.ts +0 -22
  131. package/src/extract.ts +0 -150
  132. package/src/fields.test.ts +0 -681
  133. package/src/fields.ts +0 -246
  134. package/src/index.test.ts +0 -20
  135. package/src/index.ts +0 -110
  136. package/src/llm/AGENTS.md +0 -9
  137. package/src/llm/LLMClient.test.ts +0 -394
  138. package/src/llm/LLMClient.ts +0 -264
  139. package/src/llm/RetryingRunner.test.ts +0 -174
  140. package/src/llm/RetryingRunner.ts +0 -270
  141. package/src/llm/message.test.ts +0 -42
  142. package/src/llm/message.ts +0 -47
  143. package/src/llm/models.test.ts +0 -82
  144. package/src/llm/models.ts +0 -190
  145. package/src/llm/resolveModel.ts +0 -86
  146. package/src/merge/AGENTS.md +0 -6
  147. package/src/merge/Deduplicator.test.ts +0 -108
  148. package/src/merge/Deduplicator.ts +0 -45
  149. package/src/merge/SmartDataMerger.test.ts +0 -177
  150. package/src/merge/SmartDataMerger.ts +0 -56
  151. package/src/parsers/AGENTS.md +0 -58
  152. package/src/parsers/collect.test.ts +0 -56
  153. package/src/parsers/collect.ts +0 -31
  154. package/src/parsers/mime.test.ts +0 -91
  155. package/src/parsers/mime.ts +0 -137
  156. package/src/parsers/npm.ts +0 -26
  157. package/src/parsers/pdf.test.ts +0 -394
  158. package/src/parsers/pdf.ts +0 -194
  159. package/src/parsers/runner.test.ts +0 -95
  160. package/src/parsers/runner.ts +0 -177
  161. package/src/parsers/types.ts +0 -29
  162. package/src/prompts/AGENTS.md +0 -8
  163. package/src/prompts/DeduplicationPrompt.test.ts +0 -41
  164. package/src/prompts/DeduplicationPrompt.ts +0 -37
  165. package/src/prompts/ExtractorPrompt.test.ts +0 -21
  166. package/src/prompts/ExtractorPrompt.ts +0 -72
  167. package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
  168. package/src/prompts/ParallelMergerPrompt.ts +0 -37
  169. package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
  170. package/src/prompts/SequentialExtractorPrompt.ts +0 -82
  171. package/src/prompts/formatArtifacts.test.ts +0 -39
  172. package/src/prompts/formatArtifacts.ts +0 -46
  173. package/src/strategies/AGENTS.md +0 -6
  174. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
  175. package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
  176. package/src/strategies/DoublePassStrategy.test.ts +0 -48
  177. package/src/strategies/DoublePassStrategy.ts +0 -266
  178. package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
  179. package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
  180. package/src/strategies/ParallelStrategy.test.ts +0 -61
  181. package/src/strategies/ParallelStrategy.ts +0 -208
  182. package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
  183. package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
  184. package/src/strategies/SequentialStrategy.test.ts +0 -53
  185. package/src/strategies/SequentialStrategy.ts +0 -142
  186. package/src/strategies/SimpleStrategy.test.ts +0 -46
  187. package/src/strategies/SimpleStrategy.ts +0 -94
  188. package/src/strategies/concurrency.test.ts +0 -16
  189. package/src/strategies/concurrency.ts +0 -14
  190. package/src/strategies/index.test.ts +0 -20
  191. package/src/strategies/utils.test.ts +0 -76
  192. package/src/strategies/utils.ts +0 -95
  193. package/src/tokenization.test.ts +0 -119
  194. package/src/tokenization.ts +0 -71
  195. package/src/types.test.ts +0 -25
  196. package/src/types.ts +0 -174
  197. package/src/validation/AGENTS.md +0 -7
  198. package/src/validation/validator.test.ts +0 -204
  199. package/src/validation/validator.ts +0 -90
  200. package/tsconfig.json +0 -22
@@ -1,345 +0,0 @@
1
- import type { ExtractionResult, ExtractionStrategy } from "../types";
2
- import type { ExtractionOptions } from "../types";
3
- import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
4
- import { buildDeduplicationPrompt } from "../prompts/DeduplicationPrompt";
5
- import {
6
- extractWithPrompt,
7
- getBatches,
8
- mergeUsage,
9
- serializeSchema,
10
- } from "./utils";
11
- import { runConcurrently } from "./concurrency";
12
- import { SmartDataMerger } from "../merge/SmartDataMerger";
13
- import {
14
- findExactDuplicatesWithHashing,
15
- deduplicateByIndices,
16
- } from "../merge/Deduplicator";
17
- import { runWithRetries } from "../llm/RetryingRunner";
18
-
19
/**
 * Settings consumed by `ParallelAutoMergeStrategy`.
 */
export type ParallelAutoMergeStrategyConfig = {
  /**
   * Model handed to every per-batch extraction call; the dedupe call falls
   * back to it when `dedupeModel` is not set.
   */
  model: unknown;
  /** Passed to `getBatches` as `maxTokens`. */
  chunkSize: number;
  /** Limit given to `runConcurrently`; the batch count is used when omitted. */
  concurrency?: number;
  /** Passed to `getBatches` as `maxImages`. */
  maxImages?: number;
  /** Forwarded as the third argument of `buildExtractorPrompt`. */
  outputInstructions?: string;
  /** Model for the LLM dedupe call; `model` is used when omitted. */
  dedupeModel?: unknown;
  /** Executor forwarded to the per-batch `extractWithPrompt` calls. */
  execute?: typeof runWithRetries;
  /** Executor forwarded to the dedupe `runWithRetries` call. */
  dedupeExecute?: typeof runWithRetries;
  /**
   * Strictness flag: the batch calls prefer `options.strict` and fall back to
   * this value, the dedupe call always uses this value.
   */
  strict?: boolean;
};
30
-
31
/**
 * JSON schema describing the reply requested from the LLM dedupe call: an
 * object whose single, required property `keys` is an array of strings and
 * which allows no other properties.
 */
const dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" },
    },
  },
  required: ["keys"],
  additionalProperties: false,
} as const;
39
-
40
/**
 * Returns a shallow copy of `data` in which every array-valued property has
 * been replaced by the result of `deduplicateByIndices`, fed with the indices
 * reported by `findExactDuplicatesWithHashing` for that array.
 *
 * Properties that are not arrays are copied over unchanged and the input
 * object itself is never written to.
 */
const dedupeArrays = (data: Record<string, unknown>) => {
  const copy: Record<string, unknown> = { ...data };

  Object.keys(copy).forEach((field) => {
    const current = copy[field];
    if (!Array.isArray(current)) {
      return;
    }
    const duplicateIndices = findExactDuplicatesWithHashing(current);
    copy[field] = deduplicateByIndices(current, duplicateIndices);
  });

  return copy;
};
50
-
51
/**
 * Removes one array element addressed by a `"<field>.<index>"` path.
 *
 * Only the first two dot-separated segments of `path` are read: the first
 * names a top-level property of `data`, the second is the zero-based position
 * to drop from that property's array.
 *
 * `data` is returned as-is (same reference) when nothing can be removed:
 * the field segment is empty, the index segment is missing or is not made of
 * decimal digits only, the property is not an array, or the index lies past
 * the end of the array.
 *
 * @param data - Object holding the array; it is never mutated.
 * @param path - Location of the element, e.g. `"items.2"`.
 * @returns A shallow copy of `data` whose array at the addressed field is a
 *   copy without the element, or `data` itself when nothing was removed.
 */
const removeByPath = (data: Record<string, unknown>, path: string) => {
  const [root, indexStr] = path.split(".");

  // `Number("")` and `Number(" ")` evaluate to 0 and `splice` treats negative
  // positions as offsets from the end, so a loosely parsed index could delete
  // an element nobody asked for. Accept plain decimal digits only.
  if (!root || indexStr === undefined || !/^\d+$/.test(indexStr)) {
    return data;
  }

  const value = data[root];
  if (!Array.isArray(value)) {
    return data;
  }

  const index = Number(indexStr);
  if (index >= value.length) {
    // Nothing lives at that position; skip the needless copy.
    return data;
  }

  const next = [...value];
  next.splice(index, 1);
  return { ...data, [root]: next };
};
67
-
68
/**
 * Puts duplicate paths into an order that is safe for one-at-a-time removal.
 *
 * Every path has the form `"<field>.<index>"`. Deleting an element shifts all
 * later elements of the same array down by one, so the paths are de-duplicated
 * and sorted by descending index: an earlier removal then never moves an
 * element that a later path still points at. Paths whose index segment is
 * missing or not a finite number are placed last.
 *
 * NOTE(review): assumes the reported indices refer to the object the model
 * was shown, i.e. before any removal — confirm against the dedupe prompt.
 */
const orderDedupePaths = (paths: readonly string[]): string[] => {
  const indexOfPath = (path: string): number => {
    const index = Number(path.split(".")[1]);
    return Number.isFinite(index) ? index : -1;
  };
  return [...new Set(paths)].sort((a, b) => indexOfPath(b) - indexOfPath(a));
};

/**
 * Extraction strategy named `"parallel-auto-merge"`.
 *
 * `run` batches the artifacts with `getBatches`, extracts every batch through
 * `runConcurrently`, folds the batch results together with `SmartDataMerger`,
 * passes the merged object through `dedupeArrays`, and finally asks an LLM for
 * `"<field>.<index>"` paths which are removed with `removeByPath`.
 */
export class ParallelAutoMergeStrategy<T> implements ExtractionStrategy<T> {
  public name = "parallel-auto-merge";
  private config: ParallelAutoMergeStrategyConfig;

  constructor(config: ParallelAutoMergeStrategyConfig) {
    this.config = config;
  }

  /**
   * Step total reported with every progress notification: the number of
   * batches `getBatches` yields for `artifacts`, plus three.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length + 3;
  }

  /**
   * Runs the whole extract → merge → dedupe pipeline.
   *
   * @param options - Artifacts, schema and the optional `events`, `debug`,
   *   `telemetry` and `strict` settings of this run.
   * @returns The deduplicated data together with the usage of every batch
   *   call and of the dedupe call, combined by `mergeUsage`.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `parallel_auto_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: strategySpan,
      });
      // The counter is shared by all concurrently running tasks; emitStep
      // receives its current value so both listeners see the same number.
      step += 1;
      await this.emitStep(
        options,
        step,
        totalSteps,
        `batch ${index + 1}/${batches.length}`,
      );
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    const merger = new SmartDataMerger(
      options.schema as Record<string, unknown>,
    );
    let merged = {} as Record<string, unknown>;

    debug?.mergeStart({
      mergeId: "parallel_auto_smart_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Create smart merge span
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length,
      },
    });

    for (const result of results) {
      const incoming = result.data as Record<string, unknown>;
      merged = merger.merge(merged, incoming);

      // Log merge operation per field. `leftCount` is read from the object
      // after this result was merged in, `rightCount` from the incoming one.
      for (const key of Object.keys(incoming)) {
        const mergedValue = merged[key];
        const incomingValue = incoming[key];
        const leftArray = Array.isArray(mergedValue)
          ? mergedValue.length
          : undefined;
        const rightArray = Array.isArray(incomingValue)
          ? incomingValue.length
          : undefined;

        debug?.smartMergeField({
          mergeId: "parallel_auto_smart_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray,
        });

        // Record merge event in telemetry
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1,
          });
        }
      }
    }

    debug?.mergeComplete({
      mergeId: "parallel_auto_smart_merge",
      success: true,
    });

    // End merge span
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged,
      });
    }

    merged = dedupeArrays(merged);

    // Create exact dedupe span
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing",
      },
    });

    // End exact dedupe span
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length,
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged,
      });
    }

    const dedupePrompt = buildDeduplicationPrompt(schema, merged);

    debug?.dedupeStart({
      dedupeId: "parallel_auto_dedupe",
      itemCount: Object.keys(merged).length,
    });

    // Create LLM dedupe span
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm",
      },
    });

    const dedupeResponse = await runWithRetries<{ keys: string[] }>({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "parallel_auto_dedupe",
      telemetry: telemetry ?? undefined,
      parentSpan: llmDedupeSpan,
    });

    step += 1;
    await this.emitStep(options, step, totalSteps, "dedupe");

    // Remove highest indices first and each path only once; removing in the
    // reported order would shift the positions later paths refer to.
    const removalPaths = orderDedupePaths(dedupeResponse.data.keys);
    let deduped = merged;
    for (const path of removalPaths) {
      deduped = removeByPath(deduped, path);
    }

    debug?.dedupeComplete({
      dedupeId: "parallel_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      // Distinct paths handed to removeByPath.
      itemsRemoved: removalPaths.length,
    });

    // End LLM dedupe span
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length,
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped,
      });
    }

    // End strategy span
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: deduped,
      });
    }

    return {
      data: deduped as T,
      usage: mergeUsage([...results.map((r) => r.usage), dedupeResponse.usage]),
    };
  }

  /**
   * Reports one progress step to `options.events.onStep` (awaited) and then
   * to `options.debug.step`.
   *
   * `step` is taken by value so both listeners receive the same number even
   * if another concurrently running task advances the shared counter while
   * `onStep` is being awaited.
   */
  private async emitStep(
    options: ExtractionOptions<T>,
    step: number,
    total: number,
    label: string,
  ): Promise<void> {
    await options.events?.onStep?.({ step, total, label });
    options.debug?.step({ step, total, label, strategy: this.name });
  }
}
335
-
336
/**
 * Factory shorthand: builds a `ParallelAutoMergeStrategy<T>` from `config`.
 */
export const parallelAutoMerge = <T>(config: ParallelAutoMergeStrategyConfig) =>
  new ParallelAutoMergeStrategy<T>(config);
341
-
342
/**
 * Bundles the module-private helpers `dedupeArrays` and `removeByPath` under
 * one export (presumably so unit tests can reach them, as the name suggests).
 */
export const __testing__ = {
  dedupeArrays: dedupeArrays,
  removeByPath: removeByPath,
};
@@ -1,61 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import type { JSONSchemaType } from "ajv";
3
- import { ParallelStrategy } from "./ParallelStrategy";
4
- import type { Artifact, ExtractionOptions } from "../types";
5
-
6
type Output = { title: string };

/** Schema of the single-field object the strategy is asked to produce. */
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: { title: { type: "string" } },
  required: ["title"],
  additionalProperties: false,
};

/** Builds a text artifact with an empty raw buffer and one "abcdefgh" entry. */
const textArtifact = (id: string): Artifact => ({
  id,
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "abcdefgh" }],
});

const artifacts: Artifact[] = [textArtifact("a1"), textArtifact("a2")];

/** Canned executor reply carrying `title` and a fixed token usage. */
const reply = (title: string) => ({
  data: { title },
  usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
});

test("ParallelStrategy merges batch results", async () => {
  let calls = 0;

  // Stub executor: counts its invocations and answers "merged" when the user
  // text of the request contains the "<json-objects>" marker, "chunk" otherwise.
  const execute = async (request: any) => {
    calls += 1;
    const userText = typeof request.user === "string" ? request.user : "";
    const isMergeRequest = userText.includes("<json-objects>");
    return reply(isMergeRequest ? "merged" : "chunk");
  };

  const strategy = new ParallelStrategy<Output>({
    model: {},
    mergeModel: {},
    chunkSize: 2,
    execute: execute as any,
  });

  const options: ExtractionOptions<Output> = {
    artifacts,
    schema,
    strategy,
  };

  const result = await strategy.run(options);

  // The final data comes from the merge reply and the executor ran three times.
  expect(result.data.title).toBe("merged");
  expect(calls).toBe(3);
});
@@ -1,208 +0,0 @@
1
- import type { ExtractionResult, ExtractionStrategy } from "../types";
2
- import type { ExtractionOptions } from "../types";
3
- import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
4
- import { buildParallelMergerPrompt } from "../prompts/ParallelMergerPrompt";
5
- import {
6
- extractWithPrompt,
7
- getBatches,
8
- mergeUsage,
9
- serializeSchema,
10
- } from "./utils";
11
- import { runConcurrently } from "./concurrency";
12
- import { runWithRetries } from "../llm/RetryingRunner";
13
-
14
/**
 * Settings consumed by `ParallelStrategy`.
 */
export type ParallelStrategyConfig = {
  /** Model handed to every per-batch extraction call. */
  model: unknown;
  /** Model handed to the final merge call. */
  mergeModel: unknown;
  /** Passed to `getBatches` as `maxTokens`. */
  chunkSize: number;
  /** Limit given to `runConcurrently`; the batch count is used when omitted. */
  concurrency?: number;
  /** Passed to `getBatches` as `maxImages`. */
  maxImages?: number;
  /** Forwarded as the third argument of `buildExtractorPrompt`. */
  outputInstructions?: string;
  /** Executor forwarded to both the batch and the merge `extractWithPrompt` calls. */
  execute?: typeof runWithRetries;
  /** Strictness flag forwarded to the `extractWithPrompt` calls. */
  strict?: boolean;
};
24
-
25
/**
 * Extraction strategy named `"parallel"`.
 *
 * `run` batches the artifacts with `getBatches`, extracts every batch through
 * `runConcurrently`, and then issues one more `extractWithPrompt` call with
 * the prompt from `buildParallelMergerPrompt` to combine the batch results.
 */
export class ParallelStrategy<T> implements ExtractionStrategy<T> {
  public name = "parallel";
  private config: ParallelStrategyConfig;

  constructor(config: ParallelStrategyConfig) {
    this.config = config;
  }

  /**
   * Step total reported with every progress notification: the number of
   * batches `getBatches` yields for `artifacts`, plus three.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length + 3;
  }

  /**
   * Runs the extract-per-batch and merge pipeline.
   *
   * @param options - Artifacts, schema and the optional `events`, `debug`,
   *   `telemetry` and `strict` settings of this run.
   * @returns The data of the merge call together with the usage of every
   *   batch call and of the merge call, combined by `mergeUsage`.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    // Honour the per-run override for every call made on behalf of this run,
    // falling back to the strategy-level setting.
    const strict = options.strict ?? this.config.strict;
    let step = 1;

    // Emit start event
    await this.emitStep(
      options,
      step,
      totalSteps,
      batches.length > 1 ? `batch 1/${batches.length}` : "extract",
    );

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `parallel_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: strategySpan,
      });
      // Emit progress after batch completes (if there are more batches)
      const completedIndex = index + 1;
      if (completedIndex < batches.length) {
        // The counter is shared by all concurrently running tasks; emitStep
        // receives its current value so both listeners see the same number.
        step += 1;
        await this.emitStep(
          options,
          step,
          totalSteps,
          `batch ${completedIndex + 1}/${batches.length}`,
        );
      }
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    debug?.mergeStart({
      mergeId: "parallel_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Create merge span
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length,
      },
    });

    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data),
    );
    const merged = await extractWithPrompt<T>({
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute as never,
      // Same strictness as the batch calls; the merge output is produced
      // against the same `options.schema`.
      strict,
      debug,
      callId: "parallel_merge",
      telemetry: telemetry ?? undefined,
      parentSpan: mergeSpan,
    });

    step += 1;
    await this.emitStep(options, step, totalSteps, "merge");
    debug?.mergeComplete({ mergeId: "parallel_merge", success: true });

    // End merge span
    if (mergeSpan && telemetry) {
      telemetry.recordEvent(mergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1,
      });
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged.data,
      });
    }

    // End strategy span
    if (strategySpan && telemetry) {
      telemetry.endSpan(strategySpan, {
        status: "ok",
        output: merged.data,
      });
    }

    return {
      data: merged.data,
      usage: mergeUsage([...results.map((r) => r.usage), merged.usage]),
    };
  }

  /**
   * Reports one progress step to `options.events.onStep` (awaited) and then
   * to `options.debug.step`.
   *
   * `step` is taken by value so both listeners receive the same number even
   * if another concurrently running task advances the shared counter while
   * `onStep` is being awaited.
   */
  private async emitStep(
    options: ExtractionOptions<T>,
    step: number,
    total: number,
    label: string,
  ): Promise<void> {
    await options.events?.onStep?.({ step, total, label });
    options.debug?.step({ step, total, label, strategy: this.name });
  }
}
205
-
206
/**
 * Factory shorthand: builds a `ParallelStrategy<T>` from `config`.
 */
export const parallel = <T>(config: ParallelStrategyConfig) =>
  new ParallelStrategy<T>(config);
@@ -1,66 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import type { JSONSchemaType } from "ajv";
3
- import { SequentialAutoMergeStrategy } from "./SequentialAutoMergeStrategy";
4
- import type { Artifact, ExtractionOptions } from "../types";
5
-
6
type Output = { items: Array<{ id: number }> };

/** Schema of the output: an `items` array of objects with a numeric `id`. */
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    items: {
      type: "array",
      items: {
        type: "object",
        properties: { id: { type: "number" } },
        required: ["id"],
        additionalProperties: false,
      },
    },
  },
  required: ["items"],
  additionalProperties: false,
};

/** Builds a text artifact with an empty raw buffer and one "abcdefgh" entry. */
const textArtifact = (id: string): Artifact => ({
  id,
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "abcdefgh" }],
});

const artifacts: Artifact[] = [textArtifact("a1"), textArtifact("a2")];

/** Wraps `data` in a canned executor reply with a fixed token usage. */
const reply = <D>(data: D) => ({
  data,
  usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
});

test("SequentialAutoMergeStrategy merges and dedupes", async () => {
  // Every extraction call reports the same item twice.
  const execute = async () => reply({ items: [{ id: 1 }, { id: 1 }] });
  // The LLM dedupe call reports no paths to remove.
  const dedupeExecute = async () => reply({ keys: [] });

  const strategy = new SequentialAutoMergeStrategy<Output>({
    model: {},
    chunkSize: 2,
    execute: execute as any,
    dedupeExecute: dedupeExecute as any,
  });

  const options: ExtractionOptions<Output> = {
    artifacts,
    schema,
    strategy,
  };

  const result = await strategy.run(options);

  // A single item is expected to remain in the final output.
  expect(result.data.items.length).toBe(1);
});