@struktur/sdk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -0
- package/package.json +33 -0
- package/src/artifacts/AGENTS.md +16 -0
- package/src/artifacts/fileToArtifact.test.ts +37 -0
- package/src/artifacts/fileToArtifact.ts +44 -0
- package/src/artifacts/input.test.ts +243 -0
- package/src/artifacts/input.ts +360 -0
- package/src/artifacts/providers.test.ts +19 -0
- package/src/artifacts/providers.ts +7 -0
- package/src/artifacts/urlToArtifact.test.ts +23 -0
- package/src/artifacts/urlToArtifact.ts +19 -0
- package/src/auth/AGENTS.md +11 -0
- package/src/auth/config.test.ts +132 -0
- package/src/auth/config.ts +129 -0
- package/src/auth/tokens.test.ts +58 -0
- package/src/auth/tokens.ts +229 -0
- package/src/chunking/AGENTS.md +11 -0
- package/src/chunking/ArtifactBatcher.test.ts +22 -0
- package/src/chunking/ArtifactBatcher.ts +110 -0
- package/src/chunking/ArtifactSplitter.test.ts +38 -0
- package/src/chunking/ArtifactSplitter.ts +151 -0
- package/src/debug/AGENTS.md +79 -0
- package/src/debug/logger.test.ts +244 -0
- package/src/debug/logger.ts +211 -0
- package/src/extract.test.ts +22 -0
- package/src/extract.ts +114 -0
- package/src/fields.test.ts +663 -0
- package/src/fields.ts +239 -0
- package/src/index.test.ts +20 -0
- package/src/index.ts +93 -0
- package/src/llm/AGENTS.md +9 -0
- package/src/llm/LLMClient.test.ts +196 -0
- package/src/llm/LLMClient.ts +106 -0
- package/src/llm/RetryingRunner.test.ts +174 -0
- package/src/llm/RetryingRunner.ts +188 -0
- package/src/llm/message.test.ts +42 -0
- package/src/llm/message.ts +47 -0
- package/src/llm/models.test.ts +82 -0
- package/src/llm/models.ts +190 -0
- package/src/merge/AGENTS.md +6 -0
- package/src/merge/Deduplicator.test.ts +108 -0
- package/src/merge/Deduplicator.ts +45 -0
- package/src/merge/SmartDataMerger.test.ts +177 -0
- package/src/merge/SmartDataMerger.ts +56 -0
- package/src/parsers/AGENTS.md +58 -0
- package/src/parsers/collect.test.ts +56 -0
- package/src/parsers/collect.ts +31 -0
- package/src/parsers/index.ts +6 -0
- package/src/parsers/mime.test.ts +91 -0
- package/src/parsers/mime.ts +137 -0
- package/src/parsers/npm.ts +26 -0
- package/src/parsers/pdf.test.ts +394 -0
- package/src/parsers/pdf.ts +194 -0
- package/src/parsers/runner.test.ts +95 -0
- package/src/parsers/runner.ts +177 -0
- package/src/parsers/types.ts +29 -0
- package/src/prompts/AGENTS.md +8 -0
- package/src/prompts/DeduplicationPrompt.test.ts +41 -0
- package/src/prompts/DeduplicationPrompt.ts +37 -0
- package/src/prompts/ExtractorPrompt.test.ts +21 -0
- package/src/prompts/ExtractorPrompt.ts +72 -0
- package/src/prompts/ParallelMergerPrompt.test.ts +8 -0
- package/src/prompts/ParallelMergerPrompt.ts +37 -0
- package/src/prompts/SequentialExtractorPrompt.test.ts +24 -0
- package/src/prompts/SequentialExtractorPrompt.ts +82 -0
- package/src/prompts/formatArtifacts.test.ts +39 -0
- package/src/prompts/formatArtifacts.ts +46 -0
- package/src/strategies/AGENTS.md +6 -0
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +53 -0
- package/src/strategies/DoublePassAutoMergeStrategy.ts +270 -0
- package/src/strategies/DoublePassStrategy.test.ts +48 -0
- package/src/strategies/DoublePassStrategy.ts +179 -0
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +152 -0
- package/src/strategies/ParallelAutoMergeStrategy.ts +241 -0
- package/src/strategies/ParallelStrategy.test.ts +61 -0
- package/src/strategies/ParallelStrategy.ts +157 -0
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +66 -0
- package/src/strategies/SequentialAutoMergeStrategy.ts +222 -0
- package/src/strategies/SequentialStrategy.test.ts +53 -0
- package/src/strategies/SequentialStrategy.ts +119 -0
- package/src/strategies/SimpleStrategy.test.ts +46 -0
- package/src/strategies/SimpleStrategy.ts +74 -0
- package/src/strategies/concurrency.test.ts +16 -0
- package/src/strategies/concurrency.ts +14 -0
- package/src/strategies/index.test.ts +20 -0
- package/src/strategies/index.ts +7 -0
- package/src/strategies/utils.test.ts +76 -0
- package/src/strategies/utils.ts +56 -0
- package/src/tokenization.test.ts +119 -0
- package/src/tokenization.ts +71 -0
- package/src/types.test.ts +25 -0
- package/src/types.ts +116 -0
- package/src/validation/AGENTS.md +6 -0
- package/src/validation/validator.test.ts +172 -0
- package/src/validation/validator.ts +82 -0
- package/tsconfig.json +22 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { test, expect } from "bun:test";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import { rm } from "node:fs/promises";
|
|
5
|
+
import {
|
|
6
|
+
deleteProviderToken,
|
|
7
|
+
getProviderTokenOrThrow,
|
|
8
|
+
listStoredProviders,
|
|
9
|
+
resolveProviderToken,
|
|
10
|
+
setProviderToken,
|
|
11
|
+
} from "./tokens";
|
|
12
|
+
|
|
13
|
+
const makeTempDir = () => {
|
|
14
|
+
const suffix = Math.random().toString(16).slice(2);
|
|
15
|
+
return path.join(os.tmpdir(), `struktur-test-${suffix}`);
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
test("setProviderToken stores token in file when keychain disabled", async () => {
|
|
19
|
+
const tempDir = makeTempDir();
|
|
20
|
+
process.env.STRUKTUR_CONFIG_DIR = tempDir;
|
|
21
|
+
process.env.STRUKTUR_DISABLE_KEYCHAIN = "1";
|
|
22
|
+
|
|
23
|
+
try {
|
|
24
|
+
const storage = await setProviderToken("openai", "sk-test", "auto");
|
|
25
|
+
expect(storage).toBe("file");
|
|
26
|
+
|
|
27
|
+
const resolved = await resolveProviderToken("openai");
|
|
28
|
+
expect(resolved).toBe("sk-test");
|
|
29
|
+
|
|
30
|
+
const listed = await listStoredProviders();
|
|
31
|
+
expect(listed).toEqual([{ provider: "openai", storage: "file" }]);
|
|
32
|
+
|
|
33
|
+
const token = await getProviderTokenOrThrow("openai");
|
|
34
|
+
expect(token).toBe("sk-test");
|
|
35
|
+
} finally {
|
|
36
|
+
delete process.env.STRUKTUR_CONFIG_DIR;
|
|
37
|
+
delete process.env.STRUKTUR_DISABLE_KEYCHAIN;
|
|
38
|
+
await rm(tempDir, { recursive: true, force: true });
|
|
39
|
+
}
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
test("deleteProviderToken removes stored token", async () => {
|
|
43
|
+
const tempDir = makeTempDir();
|
|
44
|
+
process.env.STRUKTUR_CONFIG_DIR = tempDir;
|
|
45
|
+
process.env.STRUKTUR_DISABLE_KEYCHAIN = "1";
|
|
46
|
+
|
|
47
|
+
try {
|
|
48
|
+
await setProviderToken("anthropic", "sk-test", "auto");
|
|
49
|
+
const deleted = await deleteProviderToken("anthropic");
|
|
50
|
+
expect(deleted).toBe(true);
|
|
51
|
+
const resolved = await resolveProviderToken("anthropic");
|
|
52
|
+
expect(resolved).toBeUndefined();
|
|
53
|
+
} finally {
|
|
54
|
+
delete process.env.STRUKTUR_CONFIG_DIR;
|
|
55
|
+
delete process.env.STRUKTUR_DISABLE_KEYCHAIN;
|
|
56
|
+
await rm(tempDir, { recursive: true, force: true });
|
|
57
|
+
}
|
|
58
|
+
});
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import os from "node:os";
|
|
3
|
+
import { chmod, mkdir } from "node:fs/promises";
|
|
4
|
+
|
|
5
|
+
export type TokenStorageType = "auto" | "keychain" | "file";
|
|
6
|
+
|
|
7
|
+
export type TokenEntry = {
|
|
8
|
+
storage: "keychain" | "file";
|
|
9
|
+
token?: string;
|
|
10
|
+
account?: string;
|
|
11
|
+
service?: string;
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
type TokenStore = {
|
|
15
|
+
version: 1;
|
|
16
|
+
providers: Record<string, TokenEntry>;
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
const CONFIG_DIR_ENV = "STRUKTUR_CONFIG_DIR";
|
|
20
|
+
const DISABLE_KEYCHAIN_ENV = "STRUKTUR_DISABLE_KEYCHAIN";
|
|
21
|
+
const SERVICE_ENV = "STRUKTUR_KEYCHAIN_SERVICE";
|
|
22
|
+
const DEFAULT_SERVICE = "struktur";
|
|
23
|
+
|
|
24
|
+
const resolveConfigDir = () => {
|
|
25
|
+
return process.env[CONFIG_DIR_ENV] ?? path.join(os.homedir(), ".config", "struktur");
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
const resolveTokensPath = () => path.join(resolveConfigDir(), "tokens.json");
|
|
29
|
+
|
|
30
|
+
const emptyStore = (): TokenStore => ({ version: 1, providers: {} });
|
|
31
|
+
|
|
32
|
+
const readTokenStore = async (): Promise<TokenStore> => {
|
|
33
|
+
const tokensPath = resolveTokensPath();
|
|
34
|
+
const exists = await Bun.file(tokensPath).exists();
|
|
35
|
+
if (!exists) {
|
|
36
|
+
return emptyStore();
|
|
37
|
+
}
|
|
38
|
+
const raw = await Bun.file(tokensPath).text();
|
|
39
|
+
const parsed = JSON.parse(raw) as TokenStore;
|
|
40
|
+
if (!parsed || parsed.version !== 1 || typeof parsed.providers !== "object") {
|
|
41
|
+
return emptyStore();
|
|
42
|
+
}
|
|
43
|
+
return parsed;
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
const writeTokenStore = async (store: TokenStore) => {
|
|
47
|
+
const configDir = resolveConfigDir();
|
|
48
|
+
const tokensPath = resolveTokensPath();
|
|
49
|
+
await mkdir(configDir, { recursive: true, mode: 0o700 });
|
|
50
|
+
await Bun.write(tokensPath, JSON.stringify(store, null, 2));
|
|
51
|
+
await chmod(configDir, 0o700);
|
|
52
|
+
await chmod(tokensPath, 0o600);
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
const isKeychainAvailable = async () => {
|
|
56
|
+
if (process.env[DISABLE_KEYCHAIN_ENV]) {
|
|
57
|
+
return false;
|
|
58
|
+
}
|
|
59
|
+
if (process.platform !== "darwin") {
|
|
60
|
+
return false;
|
|
61
|
+
}
|
|
62
|
+
return await Bun.file("/usr/bin/security").exists();
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
const keychainService = () => process.env[SERVICE_ENV] ?? DEFAULT_SERVICE;
|
|
66
|
+
|
|
67
|
+
const runSecurity = async (args: string[]) => {
|
|
68
|
+
const proc = Bun.spawn({
|
|
69
|
+
cmd: ["/usr/bin/security", ...args],
|
|
70
|
+
stdout: "pipe",
|
|
71
|
+
stderr: "pipe",
|
|
72
|
+
});
|
|
73
|
+
const stdout = await new Response(proc.stdout).text();
|
|
74
|
+
const stderr = await new Response(proc.stderr).text();
|
|
75
|
+
const exitCode = await proc.exited;
|
|
76
|
+
if (exitCode !== 0) {
|
|
77
|
+
const message = stderr.trim() || `security exited with ${exitCode}`;
|
|
78
|
+
throw new Error(message);
|
|
79
|
+
}
|
|
80
|
+
return stdout;
|
|
81
|
+
};
|
|
82
|
+
|
|
83
|
+
const writeKeychainToken = async (provider: string, token: string) => {
|
|
84
|
+
await runSecurity([
|
|
85
|
+
"add-generic-password",
|
|
86
|
+
"-a",
|
|
87
|
+
provider,
|
|
88
|
+
"-s",
|
|
89
|
+
keychainService(),
|
|
90
|
+
"-w",
|
|
91
|
+
token,
|
|
92
|
+
"-U",
|
|
93
|
+
]);
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
const readKeychainToken = async (provider: string) => {
|
|
97
|
+
const output = await runSecurity([
|
|
98
|
+
"find-generic-password",
|
|
99
|
+
"-a",
|
|
100
|
+
provider,
|
|
101
|
+
"-s",
|
|
102
|
+
keychainService(),
|
|
103
|
+
"-w",
|
|
104
|
+
]);
|
|
105
|
+
return output.trim();
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
const deleteKeychainToken = async (provider: string) => {
|
|
109
|
+
await runSecurity([
|
|
110
|
+
"delete-generic-password",
|
|
111
|
+
"-a",
|
|
112
|
+
provider,
|
|
113
|
+
"-s",
|
|
114
|
+
keychainService(),
|
|
115
|
+
]);
|
|
116
|
+
};
|
|
117
|
+
|
|
118
|
+
export const listStoredProviders = async () => {
|
|
119
|
+
const store = await readTokenStore();
|
|
120
|
+
return Object.entries(store.providers).map(([provider, entry]) => ({
|
|
121
|
+
provider,
|
|
122
|
+
storage: entry.storage,
|
|
123
|
+
}));
|
|
124
|
+
};
|
|
125
|
+
|
|
126
|
+
export const setProviderToken = async (
|
|
127
|
+
provider: string,
|
|
128
|
+
token: string,
|
|
129
|
+
storage: TokenStorageType = "auto"
|
|
130
|
+
) => {
|
|
131
|
+
const store = await readTokenStore();
|
|
132
|
+
let resolvedStorage: TokenEntry["storage"] = "file";
|
|
133
|
+
|
|
134
|
+
if (storage === "keychain") {
|
|
135
|
+
if (!(await isKeychainAvailable())) {
|
|
136
|
+
throw new Error("Keychain is not available on this platform.");
|
|
137
|
+
}
|
|
138
|
+
resolvedStorage = "keychain";
|
|
139
|
+
} else if (storage === "auto") {
|
|
140
|
+
resolvedStorage = (await isKeychainAvailable()) ? "keychain" : "file";
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
if (resolvedStorage === "keychain") {
|
|
144
|
+
await writeKeychainToken(provider, token);
|
|
145
|
+
store.providers[provider] = {
|
|
146
|
+
storage: "keychain",
|
|
147
|
+
account: provider,
|
|
148
|
+
service: keychainService(),
|
|
149
|
+
};
|
|
150
|
+
} else {
|
|
151
|
+
store.providers[provider] = {
|
|
152
|
+
storage: "file",
|
|
153
|
+
token,
|
|
154
|
+
};
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
await writeTokenStore(store);
|
|
158
|
+
return resolvedStorage;
|
|
159
|
+
};
|
|
160
|
+
|
|
161
|
+
export const deleteProviderToken = async (provider: string) => {
|
|
162
|
+
const store = await readTokenStore();
|
|
163
|
+
const entry = store.providers[provider];
|
|
164
|
+
if (!entry) {
|
|
165
|
+
return false;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
if (entry.storage === "keychain") {
|
|
169
|
+
try {
|
|
170
|
+
await deleteKeychainToken(provider);
|
|
171
|
+
} catch {
|
|
172
|
+
// ignore errors for missing keychain items
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
delete store.providers[provider];
|
|
177
|
+
await writeTokenStore(store);
|
|
178
|
+
return true;
|
|
179
|
+
};
|
|
180
|
+
|
|
181
|
+
export const resolveProviderToken = async (provider: string) => {
|
|
182
|
+
const store = await readTokenStore();
|
|
183
|
+
const entry = store.providers[provider];
|
|
184
|
+
if (!entry) {
|
|
185
|
+
return undefined;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
if (entry.storage === "file") {
|
|
189
|
+
return entry.token;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
try {
|
|
193
|
+
return await readKeychainToken(provider);
|
|
194
|
+
} catch {
|
|
195
|
+
return undefined;
|
|
196
|
+
}
|
|
197
|
+
};
|
|
198
|
+
|
|
199
|
+
export const getProviderTokenOrThrow = async (provider: string) => {
|
|
200
|
+
const token = await resolveProviderToken(provider);
|
|
201
|
+
if (!token) {
|
|
202
|
+
throw new Error(`No token stored for provider: ${provider}`);
|
|
203
|
+
}
|
|
204
|
+
return token;
|
|
205
|
+
};
|
|
206
|
+
|
|
207
|
+
export const resolveProviderEnvVar = (provider: string) => {
|
|
208
|
+
switch (provider) {
|
|
209
|
+
case "openai":
|
|
210
|
+
return "OPENAI_API_KEY";
|
|
211
|
+
case "anthropic":
|
|
212
|
+
return "ANTHROPIC_API_KEY";
|
|
213
|
+
case "google":
|
|
214
|
+
return "GOOGLE_GENERATIVE_AI_API_KEY";
|
|
215
|
+
case "opencode":
|
|
216
|
+
return "OPENCODE_API_KEY";
|
|
217
|
+
case "openrouter":
|
|
218
|
+
return "OPENROUTER_API_KEY";
|
|
219
|
+
default:
|
|
220
|
+
return undefined;
|
|
221
|
+
}
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
export const maskToken = (token: string) => {
|
|
225
|
+
if (token.length <= 8) {
|
|
226
|
+
return "********";
|
|
227
|
+
}
|
|
228
|
+
return `${token.slice(0, 4)}...${token.slice(-4)}`;
|
|
229
|
+
};
|
|
@@ -0,0 +1,11 @@
Chunking module

- Purpose: split and batch artifacts based on token and image limits.
- Key files: `ArtifactSplitter.ts`, `ArtifactBatcher.ts`.
- Design: split large artifact contents into parts, then batch parts to fit limits.
- Tests: `ArtifactSplitter.test.ts`, `ArtifactBatcher.test.ts`.

IMPORTANT: When modifying chunking/batching logic, you MUST also update the client-side
JavaScript implementation in `src/cli.ts` (`generateArtifactViewerHtml`) to keep the
artifact viewer's chunking visualization in sync. The viewer includes a version stamp
to help users verify they're comparing the correct algorithm version.
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { test, expect } from "bun:test";
|
|
2
|
+
import type { Artifact } from "../types";
|
|
3
|
+
import { batchArtifacts } from "./ArtifactBatcher";
|
|
4
|
+
|
|
5
|
+
const makeArtifact = (id: string, text: string): Artifact => ({
|
|
6
|
+
id,
|
|
7
|
+
type: "text",
|
|
8
|
+
raw: async () => Buffer.from(text),
|
|
9
|
+
contents: [{ text }],
|
|
10
|
+
});
|
|
11
|
+
|
|
12
|
+
test("batchArtifacts respects maxTokens", () => {
|
|
13
|
+
const artifacts = [
|
|
14
|
+
makeArtifact("a1", "abcdefgh"),
|
|
15
|
+
makeArtifact("a2", "abcdefgh"),
|
|
16
|
+
];
|
|
17
|
+
|
|
18
|
+
const batches = batchArtifacts(artifacts, { maxTokens: 2 });
|
|
19
|
+
expect(batches.length).toBe(2);
|
|
20
|
+
expect(batches[0]?.length).toBe(1);
|
|
21
|
+
expect(batches[1]?.length).toBe(1);
|
|
22
|
+
});
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import type { Artifact } from "../types";
|
|
2
|
+
import type { DebugLogger } from "../debug/logger";
|
|
3
|
+
import {
|
|
4
|
+
countArtifactTokens,
|
|
5
|
+
countArtifactImages,
|
|
6
|
+
type TokenCountOptions,
|
|
7
|
+
} from "../tokenization";
|
|
8
|
+
import { splitArtifact } from "./ArtifactSplitter";
|
|
9
|
+
|
|
10
|
+
export type BatchOptions = TokenCountOptions & {
|
|
11
|
+
maxTokens: number;
|
|
12
|
+
maxImages?: number;
|
|
13
|
+
modelMaxTokens?: number;
|
|
14
|
+
debug?: DebugLogger;
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
export const batchArtifacts = (
|
|
18
|
+
artifacts: Artifact[],
|
|
19
|
+
options: BatchOptions
|
|
20
|
+
): Artifact[][] => {
|
|
21
|
+
const debug = options.debug;
|
|
22
|
+
const maxTokens = options.modelMaxTokens
|
|
23
|
+
? Math.min(options.maxTokens, options.modelMaxTokens)
|
|
24
|
+
: options.maxTokens;
|
|
25
|
+
|
|
26
|
+
// Log batching start
|
|
27
|
+
debug?.batchingStart({
|
|
28
|
+
totalArtifacts: artifacts.length,
|
|
29
|
+
maxTokens: options.maxTokens,
|
|
30
|
+
maxImages: options.maxImages,
|
|
31
|
+
modelMaxTokens: options.modelMaxTokens,
|
|
32
|
+
effectiveMaxTokens: maxTokens,
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
const batches: Artifact[][] = [];
|
|
36
|
+
let currentBatch: Artifact[] = [];
|
|
37
|
+
let currentTokens = 0;
|
|
38
|
+
let currentImages = 0;
|
|
39
|
+
|
|
40
|
+
for (const artifact of artifacts) {
|
|
41
|
+
const splitOptions: any = {
|
|
42
|
+
maxTokens,
|
|
43
|
+
debug,
|
|
44
|
+
};
|
|
45
|
+
if (options.maxImages !== undefined) splitOptions.maxImages = options.maxImages;
|
|
46
|
+
if (options.textTokenRatio !== undefined) splitOptions.textTokenRatio = options.textTokenRatio;
|
|
47
|
+
if (options.defaultImageTokens !== undefined) splitOptions.defaultImageTokens = options.defaultImageTokens;
|
|
48
|
+
|
|
49
|
+
const splits = splitArtifact(artifact, splitOptions);
|
|
50
|
+
|
|
51
|
+
for (const split of splits) {
|
|
52
|
+
const splitTokens = countArtifactTokens(split, options);
|
|
53
|
+
const splitImages = countArtifactImages(split);
|
|
54
|
+
|
|
55
|
+
const exceedsTokens =
|
|
56
|
+
currentBatch.length > 0 && currentTokens + splitTokens > maxTokens;
|
|
57
|
+
const exceedsImages =
|
|
58
|
+
options.maxImages !== undefined &&
|
|
59
|
+
currentBatch.length > 0 &&
|
|
60
|
+
currentImages + splitImages > options.maxImages;
|
|
61
|
+
|
|
62
|
+
if (exceedsTokens || exceedsImages) {
|
|
63
|
+
// Log batch creation
|
|
64
|
+
debug?.batchCreated({
|
|
65
|
+
batchIndex: batches.length,
|
|
66
|
+
artifactCount: currentBatch.length,
|
|
67
|
+
totalTokens: currentTokens,
|
|
68
|
+
totalImages: currentImages,
|
|
69
|
+
artifactIds: currentBatch.map(a => a.id),
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
batches.push(currentBatch);
|
|
73
|
+
currentBatch = [];
|
|
74
|
+
currentTokens = 0;
|
|
75
|
+
currentImages = 0;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
currentBatch.push(split);
|
|
79
|
+
currentTokens += splitTokens;
|
|
80
|
+
currentImages += splitImages;
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
if (currentBatch.length > 0) {
|
|
85
|
+
// Log final batch
|
|
86
|
+
debug?.batchCreated({
|
|
87
|
+
batchIndex: batches.length,
|
|
88
|
+
artifactCount: currentBatch.length,
|
|
89
|
+
totalTokens: currentTokens,
|
|
90
|
+
totalImages: currentImages,
|
|
91
|
+
artifactIds: currentBatch.map(a => a.id),
|
|
92
|
+
});
|
|
93
|
+
batches.push(currentBatch);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// Log batching complete
|
|
97
|
+
debug?.batchingComplete({
|
|
98
|
+
totalBatches: batches.length,
|
|
99
|
+
batches: batches.map((batch, index) => ({
|
|
100
|
+
index,
|
|
101
|
+
artifactCount: batch.length,
|
|
102
|
+
tokens: batch.reduce((sum, a) => sum + (a.tokens ?? 0), 0),
|
|
103
|
+
images: batch.reduce((sum, a) =>
|
|
104
|
+
sum + a.contents.reduce((c, content) => c + (content.media?.length ?? 0), 0), 0
|
|
105
|
+
),
|
|
106
|
+
})),
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
return batches;
|
|
110
|
+
};
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { test, expect } from "bun:test";
|
|
2
|
+
import type { Artifact } from "../types";
|
|
3
|
+
import { splitArtifact } from "./ArtifactSplitter";
|
|
4
|
+
|
|
5
|
+
const baseArtifact = (text: string): Artifact => ({
|
|
6
|
+
id: "artifact-1",
|
|
7
|
+
type: "text",
|
|
8
|
+
raw: async () => Buffer.from(text),
|
|
9
|
+
contents: [{ text }],
|
|
10
|
+
});
|
|
11
|
+
|
|
12
|
+
test("splitArtifact splits large text into chunks", () => {
|
|
13
|
+
const artifact = baseArtifact("abcdefghijklmnopqrst");
|
|
14
|
+
const chunks = splitArtifact(artifact, { maxTokens: 2 });
|
|
15
|
+
|
|
16
|
+
expect(chunks.length).toBe(3);
|
|
17
|
+
expect(chunks[0]?.contents[0]?.text).toBe("abcdefgh");
|
|
18
|
+
expect(chunks[1]?.contents[0]?.text).toBe("ijklmnop");
|
|
19
|
+
expect(chunks[2]?.contents[0]?.text).toBe("qrst");
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
test("splitArtifact keeps media on first text chunk", () => {
|
|
23
|
+
const artifact: Artifact = {
|
|
24
|
+
id: "artifact-2",
|
|
25
|
+
type: "pdf",
|
|
26
|
+
raw: async () => Buffer.from(""),
|
|
27
|
+
contents: [
|
|
28
|
+
{
|
|
29
|
+
text: "abcdefghijklmnopqrst",
|
|
30
|
+
media: [{ type: "image", url: "https://example.com/x.png" }],
|
|
31
|
+
},
|
|
32
|
+
],
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
const chunks = splitArtifact(artifact, { maxTokens: 2 });
|
|
36
|
+
expect(chunks[0]?.contents[0]?.media?.length).toBe(1);
|
|
37
|
+
expect(chunks[1]?.contents[0]?.media).toBeUndefined();
|
|
38
|
+
});
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import type { Artifact, ArtifactContent } from "../types";
|
|
2
|
+
import type { DebugLogger } from "../debug/logger";
|
|
3
|
+
import {
|
|
4
|
+
countContentTokens,
|
|
5
|
+
countArtifactImages,
|
|
6
|
+
countArtifactTokens,
|
|
7
|
+
estimateTextTokens,
|
|
8
|
+
type TokenCountOptions,
|
|
9
|
+
} from "../tokenization";
|
|
10
|
+
|
|
11
|
+
export type SplitOptions = TokenCountOptions & {
|
|
12
|
+
maxTokens: number;
|
|
13
|
+
maxImages?: number;
|
|
14
|
+
debug?: DebugLogger;
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
const splitTextIntoChunks = (
|
|
18
|
+
content: ArtifactContent,
|
|
19
|
+
maxTokens: number,
|
|
20
|
+
options?: TokenCountOptions,
|
|
21
|
+
debug?: DebugLogger,
|
|
22
|
+
artifactId?: string
|
|
23
|
+
): ArtifactContent[] => {
|
|
24
|
+
if (!content.text) {
|
|
25
|
+
return [content];
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
const totalTokens = estimateTextTokens(content.text, options);
|
|
29
|
+
if (totalTokens <= maxTokens) {
|
|
30
|
+
return [content];
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
const ratio = options?.textTokenRatio ?? 4;
|
|
34
|
+
const chunkSize = Math.max(1, maxTokens * ratio);
|
|
35
|
+
const chunks: ArtifactContent[] = [];
|
|
36
|
+
|
|
37
|
+
// Log text splitting
|
|
38
|
+
if (debug && artifactId) {
|
|
39
|
+
debug.chunkingSplit({
|
|
40
|
+
artifactId,
|
|
41
|
+
originalContentCount: 1,
|
|
42
|
+
splitContentCount: Math.ceil(content.text.length / chunkSize),
|
|
43
|
+
splitReason: "text_too_long",
|
|
44
|
+
originalTokens: totalTokens,
|
|
45
|
+
chunkSize,
|
|
46
|
+
});
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
for (let offset = 0; offset < content.text.length; offset += chunkSize) {
|
|
50
|
+
const text = content.text.slice(offset, offset + chunkSize);
|
|
51
|
+
chunks.push({
|
|
52
|
+
page: content.page,
|
|
53
|
+
text,
|
|
54
|
+
media: offset === 0 ? content.media : undefined,
|
|
55
|
+
});
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
return chunks;
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
export const splitArtifact = (
|
|
62
|
+
artifact: Artifact,
|
|
63
|
+
options: SplitOptions
|
|
64
|
+
): Artifact[] => {
|
|
65
|
+
const { maxTokens, maxImages, debug } = options;
|
|
66
|
+
const splitContents: ArtifactContent[] = [];
|
|
67
|
+
|
|
68
|
+
// Log chunking start
|
|
69
|
+
const totalTokens = countArtifactTokens(artifact, options);
|
|
70
|
+
debug?.chunkingStart({
|
|
71
|
+
artifactId: artifact.id,
|
|
72
|
+
totalTokens,
|
|
73
|
+
maxTokens,
|
|
74
|
+
maxImages,
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
for (const content of artifact.contents) {
|
|
78
|
+
splitContents.push(...splitTextIntoChunks(content, maxTokens, options, debug, artifact.id));
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
const chunks: Artifact[] = [];
|
|
82
|
+
let currentContents: ArtifactContent[] = [];
|
|
83
|
+
let currentTokens = 0;
|
|
84
|
+
let currentImages = 0;
|
|
85
|
+
|
|
86
|
+
for (const content of splitContents) {
|
|
87
|
+
const contentTokens = countContentTokens(content, options);
|
|
88
|
+
const contentImages = content.media?.length ?? 0;
|
|
89
|
+
|
|
90
|
+
const exceedsTokens =
|
|
91
|
+
currentContents.length > 0 && currentTokens + contentTokens > maxTokens;
|
|
92
|
+
const exceedsImages =
|
|
93
|
+
maxImages !== undefined &&
|
|
94
|
+
currentContents.length > 0 &&
|
|
95
|
+
currentImages + contentImages > maxImages;
|
|
96
|
+
|
|
97
|
+
if (exceedsTokens || exceedsImages) {
|
|
98
|
+
// Log chunk creation
|
|
99
|
+
if (debug) {
|
|
100
|
+
debug.chunkingSplit({
|
|
101
|
+
artifactId: artifact.id,
|
|
102
|
+
originalContentCount: splitContents.length,
|
|
103
|
+
splitContentCount: chunks.length + 1,
|
|
104
|
+
splitReason: exceedsTokens ? "content_limit" : "content_limit",
|
|
105
|
+
originalTokens: totalTokens,
|
|
106
|
+
chunkSize: maxTokens,
|
|
107
|
+
});
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
chunks.push({
|
|
111
|
+
...artifact,
|
|
112
|
+
id: `${artifact.id}:part:${chunks.length + 1}`,
|
|
113
|
+
contents: currentContents,
|
|
114
|
+
tokens: currentTokens,
|
|
115
|
+
});
|
|
116
|
+
currentContents = [];
|
|
117
|
+
currentTokens = 0;
|
|
118
|
+
currentImages = 0;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
currentContents.push(content);
|
|
122
|
+
currentTokens += contentTokens;
|
|
123
|
+
currentImages += contentImages;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
if (currentContents.length > 0) {
|
|
127
|
+
chunks.push({
|
|
128
|
+
...artifact,
|
|
129
|
+
id: `${artifact.id}:part:${chunks.length + 1}`,
|
|
130
|
+
contents: currentContents,
|
|
131
|
+
tokens: currentTokens,
|
|
132
|
+
});
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
if (chunks.length === 0) {
|
|
136
|
+
chunks.push({
|
|
137
|
+
...artifact,
|
|
138
|
+
id: `${artifact.id}:part:1`,
|
|
139
|
+
tokens: countArtifactTokens(artifact, options),
|
|
140
|
+
});
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
// Log chunking result
|
|
144
|
+
debug?.chunkingResult({
|
|
145
|
+
artifactId: artifact.id,
|
|
146
|
+
chunksCreated: chunks.length,
|
|
147
|
+
chunkSizes: chunks.map(c => c.tokens ?? 0),
|
|
148
|
+
});
|
|
149
|
+
|
|
150
|
+
return chunks;
|
|
151
|
+
};
|
|
@@ -0,0 +1,79 @@
# Debug Module

## Overview

The debug module provides comprehensive JSON logging for the Struktur extraction pipeline. When the `--debug` flag is enabled via the CLI, every operation is logged as single-line JSON to stderr.

## Key Files

- `logger.ts`: Core debug logger with structured logging functions for every pipeline stage.

## Debug Log Types

### CLI Initialization
- `cli_init`: CLI arguments and configuration
- `schema_loaded`: Schema source and size
- `artifacts_loaded`: Artifact count, types, tokens, images
- `model_resolved`: Model specification resolution
- `strategy_created`: Strategy selection with config

### Chunking
- `chunking_start`: Per-artifact chunking begins
- `chunking_split`: Text or content splits due to limits
- `chunking_result`: Final chunks created with sizes

### Batching
- `batching_start`: Batch creation parameters
- `batch_created`: Individual batch details
- `batching_complete`: Summary of all batches

### Strategy Execution
- `strategy_run_start`: Strategy begins with estimated steps
- `step`: Step progression through pipeline
- `progress`: Progress updates within steps

### LLM Calls
- `llm_call_start`: API call initiation with prompt sizes
- `prompt_system`: Full system prompt (verbose)
- `prompt_user`: Full user content (verbose)
- `llm_call_complete`: Call completion with tokens/timing
- `raw_response`: Raw LLM response data (verbose)

### Validation
- `validation_start`: Validation attempt begins
- `validation_success`: Validation passed
- `validation_failed`: Validation errors
- `retry`: Retry attempt triggered

### Merging
- `merge_start`: Merge operation begins
- `smart_merge_field`: Per-field merge operations
- `merge_complete`: Merge success/failure

### Deduplication
- `dedupe_start`: Deduplication begins
- `dedupe_complete`: Duplicates found and removed

### Results
- `token_usage`: Token consumption tracking
- `extraction_complete`: Final extraction status

## Usage

Enable via CLI:
```bash
struktur extract --debug -t "text to extract" -s schema.json
```

Debug logs are written to stderr as single-line JSON:
```json
{"timestamp":"2026-02-24T20:00:00.000Z","type":"cli_init","args":{"strategy":"simple"}}
```

## Design Notes

- All logs include ISO8601 timestamps
- Logs are single-line JSON for easy parsing
- Output goes to stderr to not interfere with stdout results
- The debug logger is passed through the entire pipeline via `ExtractionOptions.debug`
- When debug is disabled (default), all logging calls are no-ops