@struktur/sdk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/README.md +79 -0
  2. package/package.json +33 -0
  3. package/src/artifacts/AGENTS.md +16 -0
  4. package/src/artifacts/fileToArtifact.test.ts +37 -0
  5. package/src/artifacts/fileToArtifact.ts +44 -0
  6. package/src/artifacts/input.test.ts +243 -0
  7. package/src/artifacts/input.ts +360 -0
  8. package/src/artifacts/providers.test.ts +19 -0
  9. package/src/artifacts/providers.ts +7 -0
  10. package/src/artifacts/urlToArtifact.test.ts +23 -0
  11. package/src/artifacts/urlToArtifact.ts +19 -0
  12. package/src/auth/AGENTS.md +11 -0
  13. package/src/auth/config.test.ts +132 -0
  14. package/src/auth/config.ts +129 -0
  15. package/src/auth/tokens.test.ts +58 -0
  16. package/src/auth/tokens.ts +229 -0
  17. package/src/chunking/AGENTS.md +11 -0
  18. package/src/chunking/ArtifactBatcher.test.ts +22 -0
  19. package/src/chunking/ArtifactBatcher.ts +110 -0
  20. package/src/chunking/ArtifactSplitter.test.ts +38 -0
  21. package/src/chunking/ArtifactSplitter.ts +151 -0
  22. package/src/debug/AGENTS.md +79 -0
  23. package/src/debug/logger.test.ts +244 -0
  24. package/src/debug/logger.ts +211 -0
  25. package/src/extract.test.ts +22 -0
  26. package/src/extract.ts +114 -0
  27. package/src/fields.test.ts +663 -0
  28. package/src/fields.ts +239 -0
  29. package/src/index.test.ts +20 -0
  30. package/src/index.ts +93 -0
  31. package/src/llm/AGENTS.md +9 -0
  32. package/src/llm/LLMClient.test.ts +196 -0
  33. package/src/llm/LLMClient.ts +106 -0
  34. package/src/llm/RetryingRunner.test.ts +174 -0
  35. package/src/llm/RetryingRunner.ts +188 -0
  36. package/src/llm/message.test.ts +42 -0
  37. package/src/llm/message.ts +47 -0
  38. package/src/llm/models.test.ts +82 -0
  39. package/src/llm/models.ts +190 -0
  40. package/src/merge/AGENTS.md +6 -0
  41. package/src/merge/Deduplicator.test.ts +108 -0
  42. package/src/merge/Deduplicator.ts +45 -0
  43. package/src/merge/SmartDataMerger.test.ts +177 -0
  44. package/src/merge/SmartDataMerger.ts +56 -0
  45. package/src/parsers/AGENTS.md +58 -0
  46. package/src/parsers/collect.test.ts +56 -0
  47. package/src/parsers/collect.ts +31 -0
  48. package/src/parsers/index.ts +6 -0
  49. package/src/parsers/mime.test.ts +91 -0
  50. package/src/parsers/mime.ts +137 -0
  51. package/src/parsers/npm.ts +26 -0
  52. package/src/parsers/pdf.test.ts +394 -0
  53. package/src/parsers/pdf.ts +194 -0
  54. package/src/parsers/runner.test.ts +95 -0
  55. package/src/parsers/runner.ts +177 -0
  56. package/src/parsers/types.ts +29 -0
  57. package/src/prompts/AGENTS.md +8 -0
  58. package/src/prompts/DeduplicationPrompt.test.ts +41 -0
  59. package/src/prompts/DeduplicationPrompt.ts +37 -0
  60. package/src/prompts/ExtractorPrompt.test.ts +21 -0
  61. package/src/prompts/ExtractorPrompt.ts +72 -0
  62. package/src/prompts/ParallelMergerPrompt.test.ts +8 -0
  63. package/src/prompts/ParallelMergerPrompt.ts +37 -0
  64. package/src/prompts/SequentialExtractorPrompt.test.ts +24 -0
  65. package/src/prompts/SequentialExtractorPrompt.ts +82 -0
  66. package/src/prompts/formatArtifacts.test.ts +39 -0
  67. package/src/prompts/formatArtifacts.ts +46 -0
  68. package/src/strategies/AGENTS.md +6 -0
  69. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +53 -0
  70. package/src/strategies/DoublePassAutoMergeStrategy.ts +270 -0
  71. package/src/strategies/DoublePassStrategy.test.ts +48 -0
  72. package/src/strategies/DoublePassStrategy.ts +179 -0
  73. package/src/strategies/ParallelAutoMergeStrategy.test.ts +152 -0
  74. package/src/strategies/ParallelAutoMergeStrategy.ts +241 -0
  75. package/src/strategies/ParallelStrategy.test.ts +61 -0
  76. package/src/strategies/ParallelStrategy.ts +157 -0
  77. package/src/strategies/SequentialAutoMergeStrategy.test.ts +66 -0
  78. package/src/strategies/SequentialAutoMergeStrategy.ts +222 -0
  79. package/src/strategies/SequentialStrategy.test.ts +53 -0
  80. package/src/strategies/SequentialStrategy.ts +119 -0
  81. package/src/strategies/SimpleStrategy.test.ts +46 -0
  82. package/src/strategies/SimpleStrategy.ts +74 -0
  83. package/src/strategies/concurrency.test.ts +16 -0
  84. package/src/strategies/concurrency.ts +14 -0
  85. package/src/strategies/index.test.ts +20 -0
  86. package/src/strategies/index.ts +7 -0
  87. package/src/strategies/utils.test.ts +76 -0
  88. package/src/strategies/utils.ts +56 -0
  89. package/src/tokenization.test.ts +119 -0
  90. package/src/tokenization.ts +71 -0
  91. package/src/types.test.ts +25 -0
  92. package/src/types.ts +116 -0
  93. package/src/validation/AGENTS.md +6 -0
  94. package/src/validation/validator.test.ts +172 -0
  95. package/src/validation/validator.ts +82 -0
  96. package/tsconfig.json +22 -0
@@ -0,0 +1,58 @@
1
+ import { test, expect } from "bun:test";
2
+ import path from "node:path";
3
+ import os from "node:os";
4
+ import { rm } from "node:fs/promises";
5
+ import {
6
+ deleteProviderToken,
7
+ getProviderTokenOrThrow,
8
+ listStoredProviders,
9
+ resolveProviderToken,
10
+ setProviderToken,
11
+ } from "./tokens";
12
+
13
+ const makeTempDir = () => {
14
+ const suffix = Math.random().toString(16).slice(2);
15
+ return path.join(os.tmpdir(), `struktur-test-${suffix}`);
16
+ };
17
+
18
+ test("setProviderToken stores token in file when keychain disabled", async () => {
19
+ const tempDir = makeTempDir();
20
+ process.env.STRUKTUR_CONFIG_DIR = tempDir;
21
+ process.env.STRUKTUR_DISABLE_KEYCHAIN = "1";
22
+
23
+ try {
24
+ const storage = await setProviderToken("openai", "sk-test", "auto");
25
+ expect(storage).toBe("file");
26
+
27
+ const resolved = await resolveProviderToken("openai");
28
+ expect(resolved).toBe("sk-test");
29
+
30
+ const listed = await listStoredProviders();
31
+ expect(listed).toEqual([{ provider: "openai", storage: "file" }]);
32
+
33
+ const token = await getProviderTokenOrThrow("openai");
34
+ expect(token).toBe("sk-test");
35
+ } finally {
36
+ delete process.env.STRUKTUR_CONFIG_DIR;
37
+ delete process.env.STRUKTUR_DISABLE_KEYCHAIN;
38
+ await rm(tempDir, { recursive: true, force: true });
39
+ }
40
+ });
41
+
42
+ test("deleteProviderToken removes stored token", async () => {
43
+ const tempDir = makeTempDir();
44
+ process.env.STRUKTUR_CONFIG_DIR = tempDir;
45
+ process.env.STRUKTUR_DISABLE_KEYCHAIN = "1";
46
+
47
+ try {
48
+ await setProviderToken("anthropic", "sk-test", "auto");
49
+ const deleted = await deleteProviderToken("anthropic");
50
+ expect(deleted).toBe(true);
51
+ const resolved = await resolveProviderToken("anthropic");
52
+ expect(resolved).toBeUndefined();
53
+ } finally {
54
+ delete process.env.STRUKTUR_CONFIG_DIR;
55
+ delete process.env.STRUKTUR_DISABLE_KEYCHAIN;
56
+ await rm(tempDir, { recursive: true, force: true });
57
+ }
58
+ });
@@ -0,0 +1,229 @@
1
+ import path from "node:path";
2
+ import os from "node:os";
3
+ import { chmod, mkdir } from "node:fs/promises";
4
+
5
/** How the caller wants a token stored; "auto" picks keychain when available. */
export type TokenStorageType = "auto" | "keychain" | "file";

/**
 * One stored token entry. File entries carry the token inline; keychain
 * entries record the account/service needed to look the token up later.
 */
export type TokenEntry = {
  storage: "keychain" | "file";
  token?: string;    // present only for file storage
  account?: string;  // keychain account name (the provider id)
  service?: string;  // keychain service name used at write time
};

// On-disk shape of tokens.json.
type TokenStore = {
  version: 1;
  providers: Record<string, TokenEntry>;
};

// Environment-variable overrides (used heavily by the tests).
const CONFIG_DIR_ENV = "STRUKTUR_CONFIG_DIR";
const DISABLE_KEYCHAIN_ENV = "STRUKTUR_DISABLE_KEYCHAIN";
const SERVICE_ENV = "STRUKTUR_KEYCHAIN_SERVICE";
const DEFAULT_SERVICE = "struktur";

// Config directory: $STRUKTUR_CONFIG_DIR, falling back to ~/.config/struktur.
const resolveConfigDir = () => {
  return process.env[CONFIG_DIR_ENV] ?? path.join(os.homedir(), ".config", "struktur");
};

// Full path of the token store file inside the config directory.
const resolveTokensPath = () => path.join(resolveConfigDir(), "tokens.json");

// A fresh, empty store at the current schema version.
const emptyStore = (): TokenStore => ({ version: 1, providers: {} });
31
+
32
+ const readTokenStore = async (): Promise<TokenStore> => {
33
+ const tokensPath = resolveTokensPath();
34
+ const exists = await Bun.file(tokensPath).exists();
35
+ if (!exists) {
36
+ return emptyStore();
37
+ }
38
+ const raw = await Bun.file(tokensPath).text();
39
+ const parsed = JSON.parse(raw) as TokenStore;
40
+ if (!parsed || parsed.version !== 1 || typeof parsed.providers !== "object") {
41
+ return emptyStore();
42
+ }
43
+ return parsed;
44
+ };
45
+
46
+ const writeTokenStore = async (store: TokenStore) => {
47
+ const configDir = resolveConfigDir();
48
+ const tokensPath = resolveTokensPath();
49
+ await mkdir(configDir, { recursive: true, mode: 0o700 });
50
+ await Bun.write(tokensPath, JSON.stringify(store, null, 2));
51
+ await chmod(configDir, 0o700);
52
+ await chmod(tokensPath, 0o600);
53
+ };
54
+
55
+ const isKeychainAvailable = async () => {
56
+ if (process.env[DISABLE_KEYCHAIN_ENV]) {
57
+ return false;
58
+ }
59
+ if (process.platform !== "darwin") {
60
+ return false;
61
+ }
62
+ return await Bun.file("/usr/bin/security").exists();
63
+ };
64
+
65
+ const keychainService = () => process.env[SERVICE_ENV] ?? DEFAULT_SERVICE;
66
+
67
+ const runSecurity = async (args: string[]) => {
68
+ const proc = Bun.spawn({
69
+ cmd: ["/usr/bin/security", ...args],
70
+ stdout: "pipe",
71
+ stderr: "pipe",
72
+ });
73
+ const stdout = await new Response(proc.stdout).text();
74
+ const stderr = await new Response(proc.stderr).text();
75
+ const exitCode = await proc.exited;
76
+ if (exitCode !== 0) {
77
+ const message = stderr.trim() || `security exited with ${exitCode}`;
78
+ throw new Error(message);
79
+ }
80
+ return stdout;
81
+ };
82
+
83
+ const writeKeychainToken = async (provider: string, token: string) => {
84
+ await runSecurity([
85
+ "add-generic-password",
86
+ "-a",
87
+ provider,
88
+ "-s",
89
+ keychainService(),
90
+ "-w",
91
+ token,
92
+ "-U",
93
+ ]);
94
+ };
95
+
96
+ const readKeychainToken = async (provider: string) => {
97
+ const output = await runSecurity([
98
+ "find-generic-password",
99
+ "-a",
100
+ provider,
101
+ "-s",
102
+ keychainService(),
103
+ "-w",
104
+ ]);
105
+ return output.trim();
106
+ };
107
+
108
+ const deleteKeychainToken = async (provider: string) => {
109
+ await runSecurity([
110
+ "delete-generic-password",
111
+ "-a",
112
+ provider,
113
+ "-s",
114
+ keychainService(),
115
+ ]);
116
+ };
117
+
118
+ export const listStoredProviders = async () => {
119
+ const store = await readTokenStore();
120
+ return Object.entries(store.providers).map(([provider, entry]) => ({
121
+ provider,
122
+ storage: entry.storage,
123
+ }));
124
+ };
125
+
126
+ export const setProviderToken = async (
127
+ provider: string,
128
+ token: string,
129
+ storage: TokenStorageType = "auto"
130
+ ) => {
131
+ const store = await readTokenStore();
132
+ let resolvedStorage: TokenEntry["storage"] = "file";
133
+
134
+ if (storage === "keychain") {
135
+ if (!(await isKeychainAvailable())) {
136
+ throw new Error("Keychain is not available on this platform.");
137
+ }
138
+ resolvedStorage = "keychain";
139
+ } else if (storage === "auto") {
140
+ resolvedStorage = (await isKeychainAvailable()) ? "keychain" : "file";
141
+ }
142
+
143
+ if (resolvedStorage === "keychain") {
144
+ await writeKeychainToken(provider, token);
145
+ store.providers[provider] = {
146
+ storage: "keychain",
147
+ account: provider,
148
+ service: keychainService(),
149
+ };
150
+ } else {
151
+ store.providers[provider] = {
152
+ storage: "file",
153
+ token,
154
+ };
155
+ }
156
+
157
+ await writeTokenStore(store);
158
+ return resolvedStorage;
159
+ };
160
+
161
+ export const deleteProviderToken = async (provider: string) => {
162
+ const store = await readTokenStore();
163
+ const entry = store.providers[provider];
164
+ if (!entry) {
165
+ return false;
166
+ }
167
+
168
+ if (entry.storage === "keychain") {
169
+ try {
170
+ await deleteKeychainToken(provider);
171
+ } catch {
172
+ // ignore errors for missing keychain items
173
+ }
174
+ }
175
+
176
+ delete store.providers[provider];
177
+ await writeTokenStore(store);
178
+ return true;
179
+ };
180
+
181
+ export const resolveProviderToken = async (provider: string) => {
182
+ const store = await readTokenStore();
183
+ const entry = store.providers[provider];
184
+ if (!entry) {
185
+ return undefined;
186
+ }
187
+
188
+ if (entry.storage === "file") {
189
+ return entry.token;
190
+ }
191
+
192
+ try {
193
+ return await readKeychainToken(provider);
194
+ } catch {
195
+ return undefined;
196
+ }
197
+ };
198
+
199
+ export const getProviderTokenOrThrow = async (provider: string) => {
200
+ const token = await resolveProviderToken(provider);
201
+ if (!token) {
202
+ throw new Error(`No token stored for provider: ${provider}`);
203
+ }
204
+ return token;
205
+ };
206
+
207
+ export const resolveProviderEnvVar = (provider: string) => {
208
+ switch (provider) {
209
+ case "openai":
210
+ return "OPENAI_API_KEY";
211
+ case "anthropic":
212
+ return "ANTHROPIC_API_KEY";
213
+ case "google":
214
+ return "GOOGLE_GENERATIVE_AI_API_KEY";
215
+ case "opencode":
216
+ return "OPENCODE_API_KEY";
217
+ case "openrouter":
218
+ return "OPENROUTER_API_KEY";
219
+ default:
220
+ return undefined;
221
+ }
222
+ };
223
+
224
+ export const maskToken = (token: string) => {
225
+ if (token.length <= 8) {
226
+ return "********";
227
+ }
228
+ return `${token.slice(0, 4)}...${token.slice(-4)}`;
229
+ };
@@ -0,0 +1,11 @@
1
+ Chunking module
2
+
3
+ - Purpose: split and batch artifacts based on token and image limits.
4
+ - Key files: `ArtifactSplitter.ts`, `ArtifactBatcher.ts`.
5
+ - Design: split large artifact contents into parts, then batch parts to fit limits.
6
+ - Tests: `ArtifactSplitter.test.ts`, `ArtifactBatcher.test.ts`.
7
+
8
+ IMPORTANT: When modifying chunking/batching logic, you MUST also update the client-side
9
+ JavaScript implementation in `src/cli.ts` (`generateArtifactViewerHtml`) to keep the
10
+ artifact viewer's chunking visualization in sync. The viewer includes a version stamp
11
+ to help users verify they're comparing the correct algorithm version.
@@ -0,0 +1,22 @@
1
+ import { test, expect } from "bun:test";
2
+ import type { Artifact } from "../types";
3
+ import { batchArtifacts } from "./ArtifactBatcher";
4
+
5
+ const makeArtifact = (id: string, text: string): Artifact => ({
6
+ id,
7
+ type: "text",
8
+ raw: async () => Buffer.from(text),
9
+ contents: [{ text }],
10
+ });
11
+
12
+ test("batchArtifacts respects maxTokens", () => {
13
+ const artifacts = [
14
+ makeArtifact("a1", "abcdefgh"),
15
+ makeArtifact("a2", "abcdefgh"),
16
+ ];
17
+
18
+ const batches = batchArtifacts(artifacts, { maxTokens: 2 });
19
+ expect(batches.length).toBe(2);
20
+ expect(batches[0]?.length).toBe(1);
21
+ expect(batches[1]?.length).toBe(1);
22
+ });
@@ -0,0 +1,110 @@
1
+ import type { Artifact } from "../types";
2
+ import type { DebugLogger } from "../debug/logger";
3
+ import {
4
+ countArtifactTokens,
5
+ countArtifactImages,
6
+ type TokenCountOptions,
7
+ } from "../tokenization";
8
+ import { splitArtifact } from "./ArtifactSplitter";
9
+
10
+ export type BatchOptions = TokenCountOptions & {
11
+ maxTokens: number;
12
+ maxImages?: number;
13
+ modelMaxTokens?: number;
14
+ debug?: DebugLogger;
15
+ };
16
+
17
+ export const batchArtifacts = (
18
+ artifacts: Artifact[],
19
+ options: BatchOptions
20
+ ): Artifact[][] => {
21
+ const debug = options.debug;
22
+ const maxTokens = options.modelMaxTokens
23
+ ? Math.min(options.maxTokens, options.modelMaxTokens)
24
+ : options.maxTokens;
25
+
26
+ // Log batching start
27
+ debug?.batchingStart({
28
+ totalArtifacts: artifacts.length,
29
+ maxTokens: options.maxTokens,
30
+ maxImages: options.maxImages,
31
+ modelMaxTokens: options.modelMaxTokens,
32
+ effectiveMaxTokens: maxTokens,
33
+ });
34
+
35
+ const batches: Artifact[][] = [];
36
+ let currentBatch: Artifact[] = [];
37
+ let currentTokens = 0;
38
+ let currentImages = 0;
39
+
40
+ for (const artifact of artifacts) {
41
+ const splitOptions: any = {
42
+ maxTokens,
43
+ debug,
44
+ };
45
+ if (options.maxImages !== undefined) splitOptions.maxImages = options.maxImages;
46
+ if (options.textTokenRatio !== undefined) splitOptions.textTokenRatio = options.textTokenRatio;
47
+ if (options.defaultImageTokens !== undefined) splitOptions.defaultImageTokens = options.defaultImageTokens;
48
+
49
+ const splits = splitArtifact(artifact, splitOptions);
50
+
51
+ for (const split of splits) {
52
+ const splitTokens = countArtifactTokens(split, options);
53
+ const splitImages = countArtifactImages(split);
54
+
55
+ const exceedsTokens =
56
+ currentBatch.length > 0 && currentTokens + splitTokens > maxTokens;
57
+ const exceedsImages =
58
+ options.maxImages !== undefined &&
59
+ currentBatch.length > 0 &&
60
+ currentImages + splitImages > options.maxImages;
61
+
62
+ if (exceedsTokens || exceedsImages) {
63
+ // Log batch creation
64
+ debug?.batchCreated({
65
+ batchIndex: batches.length,
66
+ artifactCount: currentBatch.length,
67
+ totalTokens: currentTokens,
68
+ totalImages: currentImages,
69
+ artifactIds: currentBatch.map(a => a.id),
70
+ });
71
+
72
+ batches.push(currentBatch);
73
+ currentBatch = [];
74
+ currentTokens = 0;
75
+ currentImages = 0;
76
+ }
77
+
78
+ currentBatch.push(split);
79
+ currentTokens += splitTokens;
80
+ currentImages += splitImages;
81
+ }
82
+ }
83
+
84
+ if (currentBatch.length > 0) {
85
+ // Log final batch
86
+ debug?.batchCreated({
87
+ batchIndex: batches.length,
88
+ artifactCount: currentBatch.length,
89
+ totalTokens: currentTokens,
90
+ totalImages: currentImages,
91
+ artifactIds: currentBatch.map(a => a.id),
92
+ });
93
+ batches.push(currentBatch);
94
+ }
95
+
96
+ // Log batching complete
97
+ debug?.batchingComplete({
98
+ totalBatches: batches.length,
99
+ batches: batches.map((batch, index) => ({
100
+ index,
101
+ artifactCount: batch.length,
102
+ tokens: batch.reduce((sum, a) => sum + (a.tokens ?? 0), 0),
103
+ images: batch.reduce((sum, a) =>
104
+ sum + a.contents.reduce((c, content) => c + (content.media?.length ?? 0), 0), 0
105
+ ),
106
+ })),
107
+ });
108
+
109
+ return batches;
110
+ };
@@ -0,0 +1,38 @@
1
+ import { test, expect } from "bun:test";
2
+ import type { Artifact } from "../types";
3
+ import { splitArtifact } from "./ArtifactSplitter";
4
+
5
+ const baseArtifact = (text: string): Artifact => ({
6
+ id: "artifact-1",
7
+ type: "text",
8
+ raw: async () => Buffer.from(text),
9
+ contents: [{ text }],
10
+ });
11
+
12
+ test("splitArtifact splits large text into chunks", () => {
13
+ const artifact = baseArtifact("abcdefghijklmnopqrst");
14
+ const chunks = splitArtifact(artifact, { maxTokens: 2 });
15
+
16
+ expect(chunks.length).toBe(3);
17
+ expect(chunks[0]?.contents[0]?.text).toBe("abcdefgh");
18
+ expect(chunks[1]?.contents[0]?.text).toBe("ijklmnop");
19
+ expect(chunks[2]?.contents[0]?.text).toBe("qrst");
20
+ });
21
+
22
+ test("splitArtifact keeps media on first text chunk", () => {
23
+ const artifact: Artifact = {
24
+ id: "artifact-2",
25
+ type: "pdf",
26
+ raw: async () => Buffer.from(""),
27
+ contents: [
28
+ {
29
+ text: "abcdefghijklmnopqrst",
30
+ media: [{ type: "image", url: "https://example.com/x.png" }],
31
+ },
32
+ ],
33
+ };
34
+
35
+ const chunks = splitArtifact(artifact, { maxTokens: 2 });
36
+ expect(chunks[0]?.contents[0]?.media?.length).toBe(1);
37
+ expect(chunks[1]?.contents[0]?.media).toBeUndefined();
38
+ });
@@ -0,0 +1,151 @@
1
+ import type { Artifact, ArtifactContent } from "../types";
2
+ import type { DebugLogger } from "../debug/logger";
3
+ import {
4
+ countContentTokens,
5
+ countArtifactImages,
6
+ countArtifactTokens,
7
+ estimateTextTokens,
8
+ type TokenCountOptions,
9
+ } from "../tokenization";
10
+
11
+ export type SplitOptions = TokenCountOptions & {
12
+ maxTokens: number;
13
+ maxImages?: number;
14
+ debug?: DebugLogger;
15
+ };
16
+
17
+ const splitTextIntoChunks = (
18
+ content: ArtifactContent,
19
+ maxTokens: number,
20
+ options?: TokenCountOptions,
21
+ debug?: DebugLogger,
22
+ artifactId?: string
23
+ ): ArtifactContent[] => {
24
+ if (!content.text) {
25
+ return [content];
26
+ }
27
+
28
+ const totalTokens = estimateTextTokens(content.text, options);
29
+ if (totalTokens <= maxTokens) {
30
+ return [content];
31
+ }
32
+
33
+ const ratio = options?.textTokenRatio ?? 4;
34
+ const chunkSize = Math.max(1, maxTokens * ratio);
35
+ const chunks: ArtifactContent[] = [];
36
+
37
+ // Log text splitting
38
+ if (debug && artifactId) {
39
+ debug.chunkingSplit({
40
+ artifactId,
41
+ originalContentCount: 1,
42
+ splitContentCount: Math.ceil(content.text.length / chunkSize),
43
+ splitReason: "text_too_long",
44
+ originalTokens: totalTokens,
45
+ chunkSize,
46
+ });
47
+ }
48
+
49
+ for (let offset = 0; offset < content.text.length; offset += chunkSize) {
50
+ const text = content.text.slice(offset, offset + chunkSize);
51
+ chunks.push({
52
+ page: content.page,
53
+ text,
54
+ media: offset === 0 ? content.media : undefined,
55
+ });
56
+ }
57
+
58
+ return chunks;
59
+ };
60
+
61
+ export const splitArtifact = (
62
+ artifact: Artifact,
63
+ options: SplitOptions
64
+ ): Artifact[] => {
65
+ const { maxTokens, maxImages, debug } = options;
66
+ const splitContents: ArtifactContent[] = [];
67
+
68
+ // Log chunking start
69
+ const totalTokens = countArtifactTokens(artifact, options);
70
+ debug?.chunkingStart({
71
+ artifactId: artifact.id,
72
+ totalTokens,
73
+ maxTokens,
74
+ maxImages,
75
+ });
76
+
77
+ for (const content of artifact.contents) {
78
+ splitContents.push(...splitTextIntoChunks(content, maxTokens, options, debug, artifact.id));
79
+ }
80
+
81
+ const chunks: Artifact[] = [];
82
+ let currentContents: ArtifactContent[] = [];
83
+ let currentTokens = 0;
84
+ let currentImages = 0;
85
+
86
+ for (const content of splitContents) {
87
+ const contentTokens = countContentTokens(content, options);
88
+ const contentImages = content.media?.length ?? 0;
89
+
90
+ const exceedsTokens =
91
+ currentContents.length > 0 && currentTokens + contentTokens > maxTokens;
92
+ const exceedsImages =
93
+ maxImages !== undefined &&
94
+ currentContents.length > 0 &&
95
+ currentImages + contentImages > maxImages;
96
+
97
+ if (exceedsTokens || exceedsImages) {
98
+ // Log chunk creation
99
+ if (debug) {
100
+ debug.chunkingSplit({
101
+ artifactId: artifact.id,
102
+ originalContentCount: splitContents.length,
103
+ splitContentCount: chunks.length + 1,
104
+ splitReason: exceedsTokens ? "content_limit" : "content_limit",
105
+ originalTokens: totalTokens,
106
+ chunkSize: maxTokens,
107
+ });
108
+ }
109
+
110
+ chunks.push({
111
+ ...artifact,
112
+ id: `${artifact.id}:part:${chunks.length + 1}`,
113
+ contents: currentContents,
114
+ tokens: currentTokens,
115
+ });
116
+ currentContents = [];
117
+ currentTokens = 0;
118
+ currentImages = 0;
119
+ }
120
+
121
+ currentContents.push(content);
122
+ currentTokens += contentTokens;
123
+ currentImages += contentImages;
124
+ }
125
+
126
+ if (currentContents.length > 0) {
127
+ chunks.push({
128
+ ...artifact,
129
+ id: `${artifact.id}:part:${chunks.length + 1}`,
130
+ contents: currentContents,
131
+ tokens: currentTokens,
132
+ });
133
+ }
134
+
135
+ if (chunks.length === 0) {
136
+ chunks.push({
137
+ ...artifact,
138
+ id: `${artifact.id}:part:1`,
139
+ tokens: countArtifactTokens(artifact, options),
140
+ });
141
+ }
142
+
143
+ // Log chunking result
144
+ debug?.chunkingResult({
145
+ artifactId: artifact.id,
146
+ chunksCreated: chunks.length,
147
+ chunkSizes: chunks.map(c => c.tokens ?? 0),
148
+ });
149
+
150
+ return chunks;
151
+ };
@@ -0,0 +1,79 @@
1
+ # Debug Module
2
+
3
+ ## Overview
4
+
5
+ The debug module provides comprehensive JSON logging for the Struktur extraction pipeline. When the `--debug` flag is enabled via the CLI, every operation is logged as single-line JSON to stderr.
6
+
7
+ ## Key Files
8
+
9
+ - `logger.ts`: Core debug logger with structured logging functions for every pipeline stage.
10
+
11
+ ## Debug Log Types
12
+
13
+ ### CLI Initialization
14
+ - `cli_init`: CLI arguments and configuration
15
+ - `schema_loaded`: Schema source and size
16
+ - `artifacts_loaded`: Artifact count, types, tokens, images
17
+ - `model_resolved`: Model specification resolution
18
+ - `strategy_created`: Strategy selection with config
19
+
20
+ ### Chunking
21
+ - `chunking_start`: Per-artifact chunking begins
22
+ - `chunking_split`: Text or content splits due to limits
23
+ - `chunking_result`: Final chunks created with sizes
24
+
25
+ ### Batching
26
+ - `batching_start`: Batch creation parameters
27
+ - `batch_created`: Individual batch details
28
+ - `batching_complete`: Summary of all batches
29
+
30
+ ### Strategy Execution
31
+ - `strategy_run_start`: Strategy begins with estimated steps
32
+ - `step`: Step progression through pipeline
33
+ - `progress`: Progress updates within steps
34
+
35
+ ### LLM Calls
36
+ - `llm_call_start`: API call initiation with prompt sizes
37
+ - `prompt_system`: Full system prompt (verbose)
38
+ - `prompt_user`: Full user content (verbose)
39
+ - `llm_call_complete`: Call completion with tokens/timing
40
+ - `raw_response`: Raw LLM response data (verbose)
41
+
42
+ ### Validation
43
+ - `validation_start`: Validation attempt begins
44
+ - `validation_success`: Validation passed
45
+ - `validation_failed`: Validation errors
46
+ - `retry`: Retry attempt triggered
47
+
48
+ ### Merging
49
+ - `merge_start`: Merge operation begins
50
+ - `smart_merge_field`: Per-field merge operations
51
+ - `merge_complete`: Merge success/failure
52
+
53
+ ### Deduplication
54
+ - `dedupe_start`: Deduplication begins
55
+ - `dedupe_complete`: Duplicates found and removed
56
+
57
+ ### Results
58
+ - `token_usage`: Token consumption tracking
59
+ - `extraction_complete`: Final extraction status
60
+
61
+ ## Usage
62
+
63
+ Enable via CLI:
64
+ ```bash
65
+ struktur extract --debug -t "text to extract" -s schema.json
66
+ ```
67
+
68
+ Debug logs are written to stderr as single-line JSON:
69
+ ```json
70
+ {"timestamp":"2026-02-24T20:00:00.000Z","type":"cli_init","args":{"strategy":"simple"}}
71
+ ```
72
+
73
+ ## Design Notes
74
+
75
+ - All logs include ISO8601 timestamps
76
+ - Logs are single-line JSON for easy parsing
77
+ - Output goes to stderr to not interfere with stdout results
78
+ - The debug logger is passed through the entire pipeline via `ExtractionOptions.debug`
79
+ - When debug is disabled (default), all logging calls are no-ops