@struktur/sdk 2.1.1 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/dist/index.js +4111 -0
  2. package/dist/index.js.map +1 -0
  3. package/dist/parsers.js +492 -0
  4. package/dist/parsers.js.map +1 -0
  5. package/dist/strategies.js +2435 -0
  6. package/dist/strategies.js.map +1 -0
  7. package/package.json +25 -13
  8. package/src/agent-cli-integration.test.ts +0 -47
  9. package/src/agent-export.test.ts +0 -17
  10. package/src/agent-tool-labels.test.ts +0 -50
  11. package/src/artifacts/AGENTS.md +0 -16
  12. package/src/artifacts/fileToArtifact.test.ts +0 -37
  13. package/src/artifacts/fileToArtifact.ts +0 -44
  14. package/src/artifacts/input.test.ts +0 -243
  15. package/src/artifacts/input.ts +0 -360
  16. package/src/artifacts/providers.test.ts +0 -19
  17. package/src/artifacts/providers.ts +0 -7
  18. package/src/artifacts/urlToArtifact.test.ts +0 -23
  19. package/src/artifacts/urlToArtifact.ts +0 -19
  20. package/src/auth/AGENTS.md +0 -11
  21. package/src/auth/config.test.ts +0 -132
  22. package/src/auth/config.ts +0 -186
  23. package/src/auth/tokens.test.ts +0 -58
  24. package/src/auth/tokens.ts +0 -229
  25. package/src/chunking/AGENTS.md +0 -11
  26. package/src/chunking/ArtifactBatcher.test.ts +0 -22
  27. package/src/chunking/ArtifactBatcher.ts +0 -110
  28. package/src/chunking/ArtifactSplitter.test.ts +0 -38
  29. package/src/chunking/ArtifactSplitter.ts +0 -151
  30. package/src/debug/AGENTS.md +0 -79
  31. package/src/debug/logger.test.ts +0 -244
  32. package/src/debug/logger.ts +0 -211
  33. package/src/extract.test.ts +0 -22
  34. package/src/extract.ts +0 -150
  35. package/src/fields.test.ts +0 -681
  36. package/src/fields.ts +0 -246
  37. package/src/index.test.ts +0 -20
  38. package/src/index.ts +0 -110
  39. package/src/llm/AGENTS.md +0 -9
  40. package/src/llm/LLMClient.test.ts +0 -394
  41. package/src/llm/LLMClient.ts +0 -264
  42. package/src/llm/RetryingRunner.test.ts +0 -174
  43. package/src/llm/RetryingRunner.ts +0 -270
  44. package/src/llm/message.test.ts +0 -42
  45. package/src/llm/message.ts +0 -47
  46. package/src/llm/models.test.ts +0 -82
  47. package/src/llm/models.ts +0 -190
  48. package/src/llm/resolveModel.ts +0 -86
  49. package/src/merge/AGENTS.md +0 -6
  50. package/src/merge/Deduplicator.test.ts +0 -108
  51. package/src/merge/Deduplicator.ts +0 -45
  52. package/src/merge/SmartDataMerger.test.ts +0 -177
  53. package/src/merge/SmartDataMerger.ts +0 -56
  54. package/src/parsers/AGENTS.md +0 -58
  55. package/src/parsers/collect.test.ts +0 -56
  56. package/src/parsers/collect.ts +0 -31
  57. package/src/parsers/index.ts +0 -6
  58. package/src/parsers/mime.test.ts +0 -91
  59. package/src/parsers/mime.ts +0 -137
  60. package/src/parsers/npm.ts +0 -26
  61. package/src/parsers/pdf.test.ts +0 -394
  62. package/src/parsers/pdf.ts +0 -194
  63. package/src/parsers/runner.test.ts +0 -95
  64. package/src/parsers/runner.ts +0 -177
  65. package/src/parsers/types.ts +0 -29
  66. package/src/prompts/AGENTS.md +0 -8
  67. package/src/prompts/DeduplicationPrompt.test.ts +0 -41
  68. package/src/prompts/DeduplicationPrompt.ts +0 -37
  69. package/src/prompts/ExtractorPrompt.test.ts +0 -21
  70. package/src/prompts/ExtractorPrompt.ts +0 -72
  71. package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
  72. package/src/prompts/ParallelMergerPrompt.ts +0 -37
  73. package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
  74. package/src/prompts/SequentialExtractorPrompt.ts +0 -82
  75. package/src/prompts/formatArtifacts.test.ts +0 -39
  76. package/src/prompts/formatArtifacts.ts +0 -46
  77. package/src/strategies/AGENTS.md +0 -6
  78. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
  79. package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
  80. package/src/strategies/DoublePassStrategy.test.ts +0 -48
  81. package/src/strategies/DoublePassStrategy.ts +0 -266
  82. package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
  83. package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
  84. package/src/strategies/ParallelStrategy.test.ts +0 -61
  85. package/src/strategies/ParallelStrategy.ts +0 -208
  86. package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
  87. package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
  88. package/src/strategies/SequentialStrategy.test.ts +0 -53
  89. package/src/strategies/SequentialStrategy.ts +0 -142
  90. package/src/strategies/SimpleStrategy.test.ts +0 -46
  91. package/src/strategies/SimpleStrategy.ts +0 -94
  92. package/src/strategies/concurrency.test.ts +0 -16
  93. package/src/strategies/concurrency.ts +0 -14
  94. package/src/strategies/index.test.ts +0 -20
  95. package/src/strategies/index.ts +0 -7
  96. package/src/strategies/utils.test.ts +0 -76
  97. package/src/strategies/utils.ts +0 -95
  98. package/src/tokenization.test.ts +0 -119
  99. package/src/tokenization.ts +0 -71
  100. package/src/types.test.ts +0 -25
  101. package/src/types.ts +0 -174
  102. package/src/validation/AGENTS.md +0 -7
  103. package/src/validation/validator.test.ts +0 -204
  104. package/src/validation/validator.ts +0 -90
  105. package/tsconfig.json +0 -22
package/src/debug/logger.ts DELETED
@@ -1,211 +0,0 @@
1
- import type { Artifact, ArtifactContent, ExtractionEvents, Usage, StepInfo, ProgressInfo, RetryInfo, TokenUsageInfo } from "../types";
2
-
3
- export type DebugLogger = ReturnType<typeof createDebugLogger>;
4
-
5
- export const createDebugLogger = (enabled: boolean) => {
6
- const log = (entry: Record<string, unknown>) => {
7
- if (!enabled) return;
8
- const timestamp = new Date().toISOString();
9
- const logEntry = { timestamp, ...entry };
10
- process.stderr.write(JSON.stringify(logEntry) + "\n");
11
- };
12
-
13
- return {
14
- // CLI initialization
15
- cliInit: (data: { args: Record<string, unknown> }) => {
16
- log({ type: "cli_init", ...data });
17
- },
18
-
19
- schemaLoaded: (data: { source: string; schemaSize: number }) => {
20
- log({ type: "schema_loaded", ...data });
21
- },
22
-
23
- artifactsLoaded: (data: {
24
- count: number;
25
- artifacts: Array<{ id: string; type: string; contentCount: number; tokens?: number }>;
26
- totalTokens: number;
27
- totalImages: number;
28
- }) => {
29
- log({ type: "artifacts_loaded", ...data });
30
- },
31
-
32
- modelResolved: (data: { modelSpec: string; resolvedModel: string }) => {
33
- log({ type: "model_resolved", ...data });
34
- },
35
-
36
- strategyCreated: (data: { strategy: string; config: Record<string, unknown> }) => {
37
- log({ type: "strategy_created", ...data });
38
- },
39
-
40
- // Chunking
41
- chunkingStart: (data: {
42
- artifactId: string;
43
- totalTokens: number;
44
- maxTokens: number;
45
- maxImages?: number;
46
- }) => {
47
- log({ type: "chunking_start", ...data });
48
- },
49
-
50
- chunkingSplit: (data: {
51
- artifactId: string;
52
- originalContentCount: number;
53
- splitContentCount: number;
54
- splitReason: "text_too_long" | "content_limit";
55
- originalTokens: number;
56
- chunkSize: number;
57
- }) => {
58
- log({ type: "chunking_split", ...data });
59
- },
60
-
61
- chunkingResult: (data: {
62
- artifactId: string;
63
- chunksCreated: number;
64
- chunkSizes: number[];
65
- }) => {
66
- log({ type: "chunking_result", ...data });
67
- },
68
-
69
- batchingStart: (data: {
70
- totalArtifacts: number;
71
- maxTokens: number;
72
- maxImages?: number;
73
- modelMaxTokens?: number;
74
- effectiveMaxTokens: number;
75
- }) => {
76
- log({ type: "batching_start", ...data });
77
- },
78
-
79
- batchCreated: (data: {
80
- batchIndex: number;
81
- artifactCount: number;
82
- totalTokens: number;
83
- totalImages: number;
84
- artifactIds: string[];
85
- }) => {
86
- log({ type: "batch_created", ...data });
87
- },
88
-
89
- batchingComplete: (data: {
90
- totalBatches: number;
91
- batches: Array<{ index: number; artifactCount: number; tokens: number; images: number }>;
92
- }) => {
93
- log({ type: "batching_complete", ...data });
94
- },
95
-
96
- // Strategy execution
97
- strategyRunStart: (data: { strategy: string; estimatedSteps: number; artifactCount: number }) => {
98
- log({ type: "strategy_run_start", ...data });
99
- },
100
-
101
- step: (data: StepInfo & { strategy: string }) => {
102
- log({ type: "step", ...data });
103
- },
104
-
105
- progress: (data: ProgressInfo & { strategy: string; context?: string }) => {
106
- log({ type: "progress", ...data });
107
- },
108
-
109
- // LLM calls
110
- llmCallStart: (data: {
111
- callId: string;
112
- model: string;
113
- schemaName?: string;
114
- systemLength: number;
115
- userLength: number;
116
- artifactCount: number;
117
- }) => {
118
- log({ type: "llm_call_start", ...data });
119
- },
120
-
121
- llmCallComplete: (data: {
122
- callId: string;
123
- success: boolean;
124
- inputTokens: number;
125
- outputTokens: number;
126
- totalTokens: number;
127
- durationMs?: number;
128
- error?: string;
129
- }) => {
130
- log({ type: "llm_call_complete", ...data });
131
- },
132
-
133
- // Retry events
134
- retry: (data: RetryInfo & { callId: string }) => {
135
- log({ type: "retry", ...data });
136
- },
137
-
138
- // Validation
139
- validationStart: (data: { callId: string; attempt: number; maxAttempts: number; strict: boolean }) => {
140
- log({ type: "validation_start", ...data });
141
- },
142
-
143
- validationSuccess: (data: { callId: string; attempt: number }) => {
144
- log({ type: "validation_success", ...data });
145
- },
146
-
147
- validationFailed: (data: { callId: string; attempt: number; errors: unknown[] }) => {
148
- log({ type: "validation_failed", ...data });
149
- },
150
-
151
- // Merging
152
- mergeStart: (data: { mergeId: string; inputCount: number; strategy: string }) => {
153
- log({ type: "merge_start", ...data });
154
- },
155
-
156
- mergeComplete: (data: { mergeId: string; success: boolean; error?: string }) => {
157
- log({ type: "merge_complete", ...data });
158
- },
159
-
160
- // Deduplication
161
- dedupeStart: (data: { dedupeId: string; itemCount: number }) => {
162
- log({ type: "dedupe_start", ...data });
163
- },
164
-
165
- dedupeComplete: (data: { dedupeId: string; duplicatesFound: number; itemsRemoved: number }) => {
166
- log({ type: "dedupe_complete", ...data });
167
- },
168
-
169
- // Token usage tracking
170
- tokenUsage: (data: TokenUsageInfo & { context: string }) => {
171
- log({ type: "token_usage", ...data });
172
- },
173
-
174
- // Results
175
- extractionComplete: (data: {
176
- success: boolean;
177
- totalInputTokens: number;
178
- totalOutputTokens: number;
179
- totalTokens: number;
180
- error?: string;
181
- }) => {
182
- log({ type: "extraction_complete", ...data });
183
- },
184
-
185
- // Prompt details (verbose)
186
- promptSystem: (data: { callId: string; system: string }) => {
187
- log({ type: "prompt_system", ...data });
188
- },
189
-
190
- promptUser: (data: { callId: string; user: unknown }) => {
191
- log({ type: "prompt_user", ...data });
192
- },
193
-
194
- // Raw response
195
- rawResponse: (data: { callId: string; response: unknown }) => {
196
- log({ type: "raw_response", ...data });
197
- },
198
-
199
- // Smart merge details
200
- smartMergeField: (data: {
201
- mergeId: string;
202
- field: string;
203
- operation: "merge_arrays" | "merge_objects" | "replace" | "concat";
204
- leftCount?: number;
205
- rightCount?: number;
206
- resultCount?: number;
207
- }) => {
208
- log({ type: "smart_merge_field", ...data });
209
- },
210
- };
211
- };
package/src/extract.test.ts DELETED
@@ -1,22 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { extract } from "./extract";
3
- import type { ExtractionStrategy, ExtractionOptions } from "./types";
4
-
5
- test("extract delegates to strategy", async () => {
6
- const strategy: ExtractionStrategy<{ ok: boolean }> = {
7
- name: "mock",
8
- run: async () => ({
9
- data: { ok: true },
10
- usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
11
- }),
12
- };
13
-
14
- const options: ExtractionOptions<{ ok: boolean }> = {
15
- artifacts: [],
16
- schema: {},
17
- strategy,
18
- };
19
-
20
- const result = await extract(options);
21
- expect(result.data.ok).toBe(true);
22
- });
package/src/extract.ts DELETED
@@ -1,150 +0,0 @@
1
- import type { ExtractionOptions, ExtractionResult } from "./types";
2
- import { buildSchemaFromFields } from "./fields";
3
-
4
- const emptyUsage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
5
-
6
- /**
7
- * Resolve and validate the schema from ExtractionOptions.
8
- * Exactly one of `schema` or `fields` must be provided.
9
- */
10
- const resolveSchema = <T>(options: ExtractionOptions<T>) => {
11
- const hasSchema = options.schema !== undefined;
12
- const hasFields = options.fields !== undefined;
13
-
14
- if (hasSchema && hasFields) {
15
- throw new Error(
16
- "Provide either `schema` or `fields`, not both. They are mutually exclusive.",
17
- );
18
- }
19
-
20
- if (!hasSchema && !hasFields) {
21
- throw new Error(
22
- "A schema definition is required. Provide `schema` (a JSON Schema object) or `fields` (a shorthand fields string).",
23
- );
24
- }
25
-
26
- if (hasFields) {
27
- return buildSchemaFromFields(options.fields as string);
28
- }
29
-
30
- return options.schema as NonNullable<typeof options.schema>;
31
- };
32
-
33
- export const extract = async <T>(
34
- options: ExtractionOptions<T>,
35
- ): Promise<ExtractionResult<T>> => {
36
- const debug = options.debug;
37
- const telemetry = options.telemetry;
38
-
39
- // Initialize telemetry if provided
40
- if (telemetry) {
41
- await telemetry.initialize();
42
- }
43
-
44
- // Start root extraction span
45
- const rootSpan = telemetry?.startSpan({
46
- name: "struktur.extract",
47
- kind: "CHAIN",
48
- attributes: {
49
- "extraction.strategy": options.strategy?.name ?? "default",
50
- "extraction.artifacts.count": options.artifacts.length,
51
- },
52
- });
53
-
54
- try {
55
- // Validate mutual exclusion and resolve the concrete schema early so that
56
- // every strategy receives a fully-populated options object.
57
- let resolvedOptions: ExtractionOptions<T>;
58
- try {
59
- const schema = resolveSchema(options);
60
- resolvedOptions = { ...options, schema };
61
- } catch (error) {
62
- debug?.extractionComplete({
63
- success: false,
64
- totalInputTokens: 0,
65
- totalOutputTokens: 0,
66
- totalTokens: 0,
67
- error: (error as Error).message,
68
- });
69
-
70
- telemetry?.endSpan(rootSpan!, {
71
- status: "error",
72
- error: error as Error,
73
- });
74
- await telemetry?.shutdown();
75
-
76
- return {
77
- data: null as unknown as T,
78
- usage: emptyUsage,
79
- error: error as Error,
80
- };
81
- }
82
-
83
- const total = resolvedOptions.strategy.getEstimatedSteps?.(resolvedOptions.artifacts);
84
-
85
- debug?.strategyRunStart({
86
- strategy: resolvedOptions.strategy.name,
87
- estimatedSteps: total ?? 1,
88
- artifactCount: resolvedOptions.artifacts.length,
89
- });
90
-
91
- await resolvedOptions.events?.onStep?.({ step: 1, total, label: "start" });
92
- debug?.step({
93
- step: 1,
94
- total,
95
- label: "start",
96
- strategy: resolvedOptions.strategy.name,
97
- });
98
-
99
- const result = await resolvedOptions.strategy.run(resolvedOptions);
100
-
101
- await resolvedOptions.events?.onStep?.({
102
- step: total ?? 1,
103
- total,
104
- label: "complete",
105
- });
106
- debug?.step({
107
- step: total ?? 1,
108
- total,
109
- label: "complete",
110
- strategy: resolvedOptions.strategy.name,
111
- });
112
-
113
- debug?.extractionComplete({
114
- success: !result.error,
115
- totalInputTokens: result.usage.inputTokens,
116
- totalOutputTokens: result.usage.outputTokens,
117
- totalTokens: result.usage.totalTokens,
118
- error: result.error?.message,
119
- });
120
-
121
- telemetry?.endSpan(rootSpan!, {
122
- status: result.error ? "error" : "ok",
123
- output: result.data,
124
- error: result.error,
125
- });
126
- await telemetry?.shutdown();
127
-
128
- return result;
129
- } catch (error) {
130
- debug?.extractionComplete({
131
- success: false,
132
- totalInputTokens: 0,
133
- totalOutputTokens: 0,
134
- totalTokens: 0,
135
- error: (error as Error).message,
136
- });
137
-
138
- telemetry?.endSpan(rootSpan!, {
139
- status: "error",
140
- error: error as Error,
141
- });
142
- await telemetry?.shutdown();
143
-
144
- return {
145
- data: null as unknown as T,
146
- usage: emptyUsage,
147
- error: error as Error,
148
- };
149
- }
150
- };