@struktur/sdk 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/index.js +4111 -0
  2. package/dist/index.js.map +1 -0
  3. package/dist/parsers.js +492 -0
  4. package/dist/parsers.js.map +1 -0
  5. package/dist/strategies.js +2435 -0
  6. package/dist/strategies.js.map +1 -0
  7. package/package.json +24 -12
  8. package/src/agent-cli-integration.test.ts +0 -47
  9. package/src/agent-export.test.ts +0 -17
  10. package/src/agent-tool-labels.test.ts +0 -50
  11. package/src/artifacts/AGENTS.md +0 -16
  12. package/src/artifacts/fileToArtifact.test.ts +0 -37
  13. package/src/artifacts/fileToArtifact.ts +0 -44
  14. package/src/artifacts/input.test.ts +0 -243
  15. package/src/artifacts/input.ts +0 -360
  16. package/src/artifacts/providers.test.ts +0 -19
  17. package/src/artifacts/providers.ts +0 -7
  18. package/src/artifacts/urlToArtifact.test.ts +0 -23
  19. package/src/artifacts/urlToArtifact.ts +0 -19
  20. package/src/auth/AGENTS.md +0 -11
  21. package/src/auth/config.test.ts +0 -132
  22. package/src/auth/config.ts +0 -186
  23. package/src/auth/tokens.test.ts +0 -58
  24. package/src/auth/tokens.ts +0 -229
  25. package/src/chunking/AGENTS.md +0 -11
  26. package/src/chunking/ArtifactBatcher.test.ts +0 -22
  27. package/src/chunking/ArtifactBatcher.ts +0 -110
  28. package/src/chunking/ArtifactSplitter.test.ts +0 -38
  29. package/src/chunking/ArtifactSplitter.ts +0 -151
  30. package/src/debug/AGENTS.md +0 -79
  31. package/src/debug/logger.test.ts +0 -244
  32. package/src/debug/logger.ts +0 -211
  33. package/src/extract.test.ts +0 -22
  34. package/src/extract.ts +0 -150
  35. package/src/fields.test.ts +0 -681
  36. package/src/fields.ts +0 -246
  37. package/src/index.test.ts +0 -20
  38. package/src/index.ts +0 -110
  39. package/src/llm/AGENTS.md +0 -9
  40. package/src/llm/LLMClient.test.ts +0 -394
  41. package/src/llm/LLMClient.ts +0 -264
  42. package/src/llm/RetryingRunner.test.ts +0 -174
  43. package/src/llm/RetryingRunner.ts +0 -270
  44. package/src/llm/message.test.ts +0 -42
  45. package/src/llm/message.ts +0 -47
  46. package/src/llm/models.test.ts +0 -82
  47. package/src/llm/models.ts +0 -190
  48. package/src/llm/resolveModel.ts +0 -86
  49. package/src/merge/AGENTS.md +0 -6
  50. package/src/merge/Deduplicator.test.ts +0 -108
  51. package/src/merge/Deduplicator.ts +0 -45
  52. package/src/merge/SmartDataMerger.test.ts +0 -177
  53. package/src/merge/SmartDataMerger.ts +0 -56
  54. package/src/parsers/AGENTS.md +0 -58
  55. package/src/parsers/collect.test.ts +0 -56
  56. package/src/parsers/collect.ts +0 -31
  57. package/src/parsers/index.ts +0 -6
  58. package/src/parsers/mime.test.ts +0 -91
  59. package/src/parsers/mime.ts +0 -137
  60. package/src/parsers/npm.ts +0 -26
  61. package/src/parsers/pdf.test.ts +0 -394
  62. package/src/parsers/pdf.ts +0 -194
  63. package/src/parsers/runner.test.ts +0 -95
  64. package/src/parsers/runner.ts +0 -177
  65. package/src/parsers/types.ts +0 -29
  66. package/src/prompts/AGENTS.md +0 -8
  67. package/src/prompts/DeduplicationPrompt.test.ts +0 -41
  68. package/src/prompts/DeduplicationPrompt.ts +0 -37
  69. package/src/prompts/ExtractorPrompt.test.ts +0 -21
  70. package/src/prompts/ExtractorPrompt.ts +0 -72
  71. package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
  72. package/src/prompts/ParallelMergerPrompt.ts +0 -37
  73. package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
  74. package/src/prompts/SequentialExtractorPrompt.ts +0 -82
  75. package/src/prompts/formatArtifacts.test.ts +0 -39
  76. package/src/prompts/formatArtifacts.ts +0 -46
  77. package/src/strategies/AGENTS.md +0 -6
  78. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
  79. package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
  80. package/src/strategies/DoublePassStrategy.test.ts +0 -48
  81. package/src/strategies/DoublePassStrategy.ts +0 -266
  82. package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
  83. package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
  84. package/src/strategies/ParallelStrategy.test.ts +0 -61
  85. package/src/strategies/ParallelStrategy.ts +0 -208
  86. package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
  87. package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
  88. package/src/strategies/SequentialStrategy.test.ts +0 -53
  89. package/src/strategies/SequentialStrategy.ts +0 -142
  90. package/src/strategies/SimpleStrategy.test.ts +0 -46
  91. package/src/strategies/SimpleStrategy.ts +0 -94
  92. package/src/strategies/concurrency.test.ts +0 -16
  93. package/src/strategies/concurrency.ts +0 -14
  94. package/src/strategies/index.test.ts +0 -20
  95. package/src/strategies/index.ts +0 -7
  96. package/src/strategies/utils.test.ts +0 -76
  97. package/src/strategies/utils.ts +0 -95
  98. package/src/tokenization.test.ts +0 -119
  99. package/src/tokenization.ts +0 -71
  100. package/src/types.test.ts +0 -25
  101. package/src/types.ts +0 -174
  102. package/src/validation/AGENTS.md +0 -7
  103. package/src/validation/validator.test.ts +0 -204
  104. package/src/validation/validator.ts +0 -90
  105. package/tsconfig.json +0 -22
@@ -1,266 +0,0 @@
1
- import type { ExtractionResult, ExtractionStrategy } from "../types";
2
- import type { ExtractionOptions } from "../types";
3
- import { buildExtractorPrompt } from "../prompts/ExtractorPrompt";
4
- import { buildParallelMergerPrompt } from "../prompts/ParallelMergerPrompt";
5
- import { buildSequentialPrompt } from "../prompts/SequentialExtractorPrompt";
6
- import {
7
- extractWithPrompt,
8
- getBatches,
9
- mergeUsage,
10
- serializeSchema,
11
- } from "./utils";
12
- import { runConcurrently } from "./concurrency";
13
- import { runWithRetries } from "../llm/RetryingRunner";
14
-
15
/**
 * Configuration accepted by `DoublePassStrategy`.
 */
export type DoublePassStrategyConfig = {
  /** Model handed to `extractWithPrompt` for the per-batch calls of both passes. */
  model: unknown;
  /** Model handed to `extractWithPrompt` for the pass-1 merge call. */
  mergeModel: unknown;
  /** Passed to `getBatches` as `maxTokens`. */
  chunkSize: number;
  /** Limit passed to `runConcurrently` in pass 1; the batch count is used when omitted. */
  concurrency?: number;
  /** Passed to `getBatches` as `maxImages`. */
  maxImages?: number;
  /** Extra instructions forwarded to the extractor and sequential prompt builders. */
  outputInstructions?: string;
  /** Optional replacement for `runWithRetries`, forwarded to `extractWithPrompt`. */
  execute?: typeof runWithRetries;
  /** Strictness flag forwarded to `extractWithPrompt`. */
  strict?: boolean;
};
25
-
26
/**
 * Extraction strategy that walks the artifact batches twice.
 *
 * Pass 1 extracts every batch independently (run through `runConcurrently`)
 * and merges the per-batch results with `config.mergeModel`. Pass 2 iterates
 * the same batches in order, feeding the current data (as JSON) into a
 * sequential prompt so that each batch can refine the previous result.
 */
export class DoublePassStrategy<T> implements ExtractionStrategy<T> {
  public name = "double-pass";
  private config: DoublePassStrategyConfig;

  constructor(config: DoublePassStrategyConfig) {
    this.config = config;
  }

  /**
   * Estimates how many progress steps a run will report: two per batch
   * (one for each pass) plus a fixed three.
   *
   * @param artifacts - Artifacts that would be batched for the run.
   * @returns The estimated total step count.
   */
  getEstimatedSteps(artifacts: ExtractionOptions<T>["artifacts"]): number {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages,
    });
    return batches.length * 2 + 3;
  }

  /**
   * Runs both passes and returns the refined data together with the merged
   * token usage of every model call made along the way.
   *
   * @param options - Extraction options (artifacts, schema, events, debug, telemetry, strict).
   * @returns The final data after pass 2 and the combined usage.
   */
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
    const debug = options.debug;
    const { telemetry } = options;

    // Resolve strictness once so that every model call (pass-1 batches, the
    // pass-1 merge, and pass-2 batches) honours a per-call override the same way.
    const strict = options.strict ?? this.config.strict;

    // Create strategy-level span
    const strategySpan = telemetry?.startSpan({
      name: "strategy.double-pass",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency,
      },
    });

    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages,
      },
      debug,
      telemetry ?? undefined,
      strategySpan,
    );

    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;

    // Advances the shared step counter and reports it to both the event
    // listener and the debug logger. The step number is captured before the
    // await so that concurrent pass-1 tasks cannot change it between the two
    // reports.
    const reportStep = async (label: string): Promise<void> => {
      step += 1;
      const current = step;
      await options.events?.onStep?.({
        step: current,
        total: totalSteps,
        label,
      });
      debug?.step({
        step: current,
        total: totalSteps,
        label,
        strategy: this.name,
      });
    };

    // Create pass 1 span
    const pass1Span = telemetry?.startSpan({
      name: "struktur.pass_1",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 1,
        "pass.type": "parallel_extraction",
      },
    });

    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions,
      );
      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `double_pass_1_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: pass1Span,
      });
      await reportStep(`pass 1 batch ${index + 1}/${batches.length}`);
      return result;
    });

    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length,
    );

    debug?.mergeStart({
      mergeId: "double_pass_1_merge",
      inputCount: results.length,
      strategy: this.name,
    });

    // Create pass 1 merge span
    const pass1MergeSpan = telemetry?.startSpan({
      name: "struktur.pass_1_merge",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length,
      },
    });

    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data),
    );
    const merged = await extractWithPrompt<T>({
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute as never,
      strict,
      debug,
      callId: "double_pass_1_merge",
      telemetry: telemetry ?? undefined,
      parentSpan: pass1MergeSpan,
    });

    await reportStep("pass 1 merge");
    debug?.mergeComplete({ mergeId: "double_pass_1_merge", success: true });

    // End pass 1 merge span
    if (pass1MergeSpan && telemetry) {
      telemetry.recordEvent(pass1MergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1,
      });
      telemetry.endSpan(pass1MergeSpan, {
        status: "ok",
        output: merged.data,
      });
    }

    // End pass 1 span
    telemetry?.endSpan(pass1Span!, {
      status: "ok",
      output: merged.data,
    });

    // Create pass 2 span
    const pass2Span = telemetry?.startSpan({
      name: "struktur.pass_2",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 2,
        "pass.type": "sequential_refinement",
      },
    });

    // Pass 2 starts from the merged pass-1 result and refines it batch by batch.
    let currentData = merged.data;
    const usages = [...results.map((r) => r.usage), merged.usage];

    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions,
      );

      const result = await extractWithPrompt<T>({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute as never,
        strict,
        debug,
        callId: `double_pass_2_batch_${index + 1}`,
        telemetry: telemetry ?? undefined,
        parentSpan: pass2Span,
      });

      currentData = result.data;
      usages.push(result.usage);

      await reportStep(`pass 2 batch ${index + 1}/${batches.length}`);
    }

    // End pass 2 span
    telemetry?.endSpan(pass2Span!, {
      status: "ok",
      output: currentData,
    });

    // End strategy span
    telemetry?.endSpan(strategySpan!, {
      status: "ok",
      output: currentData,
    });

    return { data: currentData, usage: mergeUsage(usages) };
  }
}
263
-
264
/**
 * Convenience factory: builds a `DoublePassStrategy` from the given config.
 */
export const doublePass = <T>(config: DoublePassStrategyConfig) =>
  new DoublePassStrategy<T>(config);
@@ -1,152 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import type { JSONSchemaType } from "ajv";
3
- import { ParallelAutoMergeStrategy, __testing__ } from "./ParallelAutoMergeStrategy";
4
- import type { Artifact, ExtractionOptions } from "../types";
5
-
6
// Shape the strategy is expected to produce in these tests.
type Output = { items: Array<{ id: number }> };

// Schema of a single entry in `items`: an object with a required numeric id.
const itemSchema: JSONSchemaType<Output["items"][number]> = {
  type: "object",
  properties: { id: { type: "number" } },
  required: ["id"],
  additionalProperties: false,
};

// JSON schema matching `Output`.
const schema: JSONSchemaType<Output> = {
  type: "object",
  properties: {
    items: { type: "array", items: itemSchema },
  },
  required: ["items"],
  additionalProperties: false,
};
24
-
25
// Builds a text artifact with an empty raw buffer and one fixed text content.
const makeTextArtifact = (id: string): Artifact => ({
  id,
  type: "text",
  raw: async () => Buffer.from(""),
  contents: [{ text: "abcdefgh" }],
});

// Two identical-content artifacts that differ only by id.
const artifacts: Artifact[] = ["a1", "a2"].map((id) => makeTextArtifact(id));
39
-
40
test("ParallelAutoMergeStrategy deduplicates arrays", async () => {
  // Stub for the extraction call: always yields the same two identical items.
  const fakeExecute = async () => ({
    data: { items: [{ id: 1 }, { id: 1 }] },
    usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
  });

  // Stub for the dedupe call: reports an empty key list.
  const fakeDedupe = async () => ({
    data: { keys: [] },
    usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
  });

  const strategy = new ParallelAutoMergeStrategy<Output>({
    model: {},
    chunkSize: 2,
    execute: fakeExecute as any,
    dedupeExecute: fakeDedupe as any,
  });

  const runOptions: ExtractionOptions<Output> = { artifacts, schema, strategy };
  const { data } = await strategy.run(runOptions);

  expect(data.items.length).toBe(1);
});
67
-
68
test("dedupeArrays removes duplicates from all array fields", () => {
  // Object and primitive arrays both contain repeats; the scalar must pass through.
  const input = {
    items: [{ id: 1 }, { id: 1 }, { id: 2 }],
    names: ["a", "a", "b"],
    count: 5,
  };

  const deduped = __testing__.dedupeArrays(input);

  expect(deduped.items).toEqual([{ id: 1 }, { id: 2 }]);
  expect(deduped.names).toEqual(["a", "b"]);
  expect(deduped.count).toBe(5);
});

test("dedupeArrays handles non-array fields", () => {
  // No arrays at all: the result should equal the input.
  const input = {
    title: "test",
    count: 42,
  };

  const deduped = __testing__.dedupeArrays(input);

  expect(deduped).toEqual({ title: "test", count: 42 });
});
92
-
93
test("removeByPath removes item at path", () => {
  const input = {
    items: [{ id: 1 }, { id: 2 }, { id: 3 }],
  };

  // "items.1" addresses the middle element.
  const output = __testing__.removeByPath(input, "items.1");

  expect(output.items).toEqual([{ id: 1 }, { id: 3 }]);
});

test("removeByPath handles first item", () => {
  const input = {
    items: [{ id: 1 }, { id: 2 }],
  };

  const output = __testing__.removeByPath(input, "items.0");

  expect(output.items).toEqual([{ id: 2 }]);
});

test("removeByPath handles last item", () => {
  const input = {
    items: [{ id: 1 }, { id: 2 }],
  };

  const output = __testing__.removeByPath(input, "items.1");

  expect(output.items).toEqual([{ id: 1 }]);
});

test("removeByPath returns unchanged data for invalid path", () => {
  const input = {
    items: [{ id: 1 }],
  };

  // Empty path, path without an index, non-numeric index, unknown field.
  const invalidPaths = ["", "items", "items.abc", "missing.0"];
  for (const path of invalidPaths) {
    expect(__testing__.removeByPath(input, path)).toEqual(input);
  }
});

test("removeByPath returns unchanged data for non-array field", () => {
  const input = {
    title: "test",
  };

  const output = __testing__.removeByPath(input, "title.0");

  expect(output).toEqual(input);
});

test("removeByPath does not mutate original data", () => {
  const input = {
    items: [{ id: 1 }, { id: 2 }],
  };

  // Result is deliberately ignored; only the input is inspected afterwards.
  __testing__.removeByPath(input, "items.0");

  expect(input.items).toEqual([{ id: 1 }, { id: 2 }]);
});