@struktur/sdk 2.1.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/artifacts/fileToArtifact.d.ts +8 -0
- package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
- package/dist/artifacts/input.d.ts +60 -0
- package/dist/artifacts/input.d.ts.map +1 -0
- package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
- package/dist/artifacts/providers.d.ts.map +1 -0
- package/dist/artifacts/urlToArtifact.d.ts +3 -0
- package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
- package/dist/auth/config.d.ts +34 -0
- package/dist/auth/config.d.ts.map +1 -0
- package/dist/auth/tokens.d.ts +18 -0
- package/dist/auth/tokens.d.ts.map +1 -0
- package/dist/chunking/ArtifactBatcher.d.ts +11 -0
- package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
- package/dist/chunking/ArtifactSplitter.d.ts +10 -0
- package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
- package/dist/debug/logger.d.ts +169 -0
- package/dist/debug/logger.d.ts.map +1 -0
- package/dist/extract.d.ts +3 -0
- package/dist/extract.d.ts.map +1 -0
- package/dist/fields.d.ts +75 -0
- package/dist/fields.d.ts.map +1 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5603 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/LLMClient.d.ts +40 -0
- package/dist/llm/LLMClient.d.ts.map +1 -0
- package/dist/llm/RetryingRunner.d.ts +37 -0
- package/dist/llm/RetryingRunner.d.ts.map +1 -0
- package/dist/llm/message.d.ts +12 -0
- package/dist/llm/message.d.ts.map +1 -0
- package/dist/llm/models.d.ts +13 -0
- package/dist/llm/models.d.ts.map +1 -0
- package/dist/llm/resolveModel.d.ts +3 -0
- package/dist/llm/resolveModel.d.ts.map +1 -0
- package/dist/merge/Deduplicator.d.ts +4 -0
- package/dist/merge/Deduplicator.d.ts.map +1 -0
- package/dist/merge/SmartDataMerger.d.ts +7 -0
- package/dist/merge/SmartDataMerger.d.ts.map +1 -0
- package/dist/parsers/collect.d.ts +7 -0
- package/dist/parsers/collect.d.ts.map +1 -0
- package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/mime.d.ts +12 -0
- package/dist/parsers/mime.d.ts.map +1 -0
- package/dist/parsers/npm.d.ts +16 -0
- package/dist/parsers/npm.d.ts.map +1 -0
- package/dist/parsers/pdf.d.ts +36 -0
- package/dist/parsers/pdf.d.ts.map +1 -0
- package/dist/parsers/runner.d.ts +4 -0
- package/dist/parsers/runner.d.ts.map +1 -0
- package/dist/parsers/types.d.ts +27 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers.d.ts +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
- package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
- package/dist/prompts/ExtractorPrompt.d.ts +6 -0
- package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/formatArtifacts.d.ts +3 -0
- package/dist/prompts/formatArtifacts.d.ts.map +1 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/DoublePassStrategy.d.ts +22 -0
- package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelStrategy.d.ts +22 -0
- package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialStrategy.d.ts +20 -0
- package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
- package/dist/strategies/SimpleStrategy.d.ts +18 -0
- package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
- package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentTools.d.ts +55 -0
- package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
- package/dist/strategies/agent/index.d.ts +4 -0
- package/dist/strategies/agent/index.d.ts.map +1 -0
- package/dist/strategies/concurrency.d.ts +2 -0
- package/dist/strategies/concurrency.d.ts.map +1 -0
- package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
- package/dist/strategies/index.d.ts.map +1 -0
- package/dist/strategies/utils.d.ts +39 -0
- package/dist/strategies/utils.d.ts.map +1 -0
- package/dist/strategies.d.ts +1 -0
- package/dist/strategies.js +3930 -0
- package/dist/strategies.js.map +1 -0
- package/dist/tokenization.d.ts +11 -0
- package/dist/tokenization.d.ts.map +1 -0
- package/dist/types.d.ts +178 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/validation/validator.d.ts +20 -0
- package/dist/validation/validator.d.ts.map +1 -0
- package/package.json +30 -14
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
|
@@ -1,110 +0,0 @@
|
|
|
1
|
-
import type { Artifact } from "../types";
|
|
2
|
-
import type { DebugLogger } from "../debug/logger";
|
|
3
|
-
import {
|
|
4
|
-
countArtifactTokens,
|
|
5
|
-
countArtifactImages,
|
|
6
|
-
type TokenCountOptions,
|
|
7
|
-
} from "../tokenization";
|
|
8
|
-
import { splitArtifact } from "./ArtifactSplitter";
|
|
9
|
-
|
|
10
|
-
/**
 * Settings accepted by `batchArtifacts`, on top of the shared token-counting options.
 */
export type BatchOptions = TokenCountOptions & {
  /** Token budget for a single batch. */
  maxTokens: number;
  /** Optional cap on the number of images in a single batch; no image cap applies when omitted. */
  maxImages?: number;
  /** Optional model limit; when set, the smaller of this and `maxTokens` is the effective budget. */
  modelMaxTokens?: number;
  /** Optional debug logger that receives the batching events. */
  debug?: DebugLogger;
};
|
|
16
|
-
|
|
17
|
-
/**
 * Groups artifacts into batches that respect a token budget and an optional image cap.
 *
 * Each artifact is first passed through `splitArtifact`; the resulting parts are then
 * packed greedily in input order. A part that would push the current batch past the token
 * or image limit closes that batch and opens a new one. The first part of a batch is
 * always accepted, even when it is over a limit on its own.
 *
 * @param artifacts - Artifacts to batch, processed in the order given.
 * @param options - Limits, token-counting options and an optional debug logger.
 * @returns The batches in creation order (empty when no parts were produced).
 */
export const batchArtifacts = (
  artifacts: Artifact[],
  options: BatchOptions
): Artifact[][] => {
  const { debug, maxImages, modelMaxTokens } = options;

  // A falsy modelMaxTokens (undefined or 0) leaves options.maxTokens as the limit.
  const tokenLimit = modelMaxTokens
    ? Math.min(options.maxTokens, modelMaxTokens)
    : options.maxTokens;

  debug?.batchingStart({
    totalArtifacts: artifacts.length,
    maxTokens: options.maxTokens,
    maxImages,
    modelMaxTokens,
    effectiveMaxTokens: tokenLimit,
  });

  const batches: Artifact[][] = [];
  let pending: Artifact[] = [];
  let pendingTokens = 0;
  let pendingImages = 0;

  // Reports the pending batch to the debug logger, stores it and starts a fresh one.
  const flushPending = (): void => {
    debug?.batchCreated({
      batchIndex: batches.length,
      artifactCount: pending.length,
      totalTokens: pendingTokens,
      totalImages: pendingImages,
      artifactIds: pending.map((item) => item.id),
    });

    batches.push(pending);
    pending = [];
    pendingTokens = 0;
    pendingImages = 0;
  };

  for (const artifact of artifacts) {
    // Optional settings are copied only when defined, so the split options never
    // carry an explicit `undefined` for them.
    const splitOptions: any = { maxTokens: tokenLimit, debug };
    if (maxImages !== undefined) {
      splitOptions.maxImages = maxImages;
    }
    if (options.textTokenRatio !== undefined) {
      splitOptions.textTokenRatio = options.textTokenRatio;
    }
    if (options.defaultImageTokens !== undefined) {
      splitOptions.defaultImageTokens = options.defaultImageTokens;
    }

    for (const part of splitArtifact(artifact, splitOptions)) {
      const partTokens = countArtifactTokens(part, options);
      const partImages = countArtifactImages(part);

      // An empty batch never overflows: its first part is always accepted.
      const hasPending = pending.length > 0;
      const overTokens = hasPending && pendingTokens + partTokens > tokenLimit;
      const overImages =
        hasPending &&
        maxImages !== undefined &&
        pendingImages + partImages > maxImages;

      if (overTokens || overImages) {
        flushPending();
      }

      pending.push(part);
      pendingTokens += partTokens;
      pendingImages += partImages;
    }
  }

  if (pending.length > 0) {
    flushPending();
  }

  debug?.batchingComplete({
    totalBatches: batches.length,
    batches: batches.map((batch, index) => {
      // The summary uses each part's stored `tokens` (0 when absent) and counts
      // images straight from the media arrays.
      let tokens = 0;
      let images = 0;
      for (const item of batch) {
        tokens += item.tokens ?? 0;
        for (const content of item.contents) {
          images += content.media?.length ?? 0;
        }
      }
      return { index, artifactCount: batch.length, tokens, images };
    }),
  });

  return batches;
};
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { Artifact } from "../types";
|
|
3
|
-
import { splitArtifact } from "./ArtifactSplitter";
|
|
4
|
-
|
|
5
|
-
/** Builds a minimal text artifact whose single content entry holds `text`. */
function baseArtifact(text: string): Artifact {
  return {
    id: "artifact-1",
    type: "text",
    raw: async () => Buffer.from(text),
    contents: [{ text }],
  };
}
|
|
11
|
-
|
|
12
|
-
// 20 characters with maxTokens: 2 are expected to come back as 8 + 8 + 4 characters.
test("splitArtifact splits large text into chunks", () => {
  const chunks = splitArtifact(baseArtifact("abcdefghijklmnopqrst"), {
    maxTokens: 2,
  });

  expect(chunks.length).toBe(3);

  const [first, second, third] = chunks;
  expect(first?.contents[0]?.text).toBe("abcdefgh");
  expect(second?.contents[0]?.text).toBe("ijklmnop");
  expect(third?.contents[0]?.text).toBe("qrst");
});
|
|
21
|
-
|
|
22
|
-
// When a content entry's text is split, its media must stay with the first piece only.
test("splitArtifact keeps media on first text chunk", () => {
  const pdfWithImage: Artifact = {
    id: "artifact-2",
    type: "pdf",
    raw: async () => Buffer.from(""),
    contents: [
      {
        text: "abcdefghijklmnopqrst",
        media: [{ type: "image", url: "https://example.com/x.png" }],
      },
    ],
  };

  const [first, second] = splitArtifact(pdfWithImage, { maxTokens: 2 });

  expect(first?.contents[0]?.media?.length).toBe(1);
  expect(second?.contents[0]?.media).toBeUndefined();
});
|
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
import type { Artifact, ArtifactContent } from "../types";
|
|
2
|
-
import type { DebugLogger } from "../debug/logger";
|
|
3
|
-
import {
|
|
4
|
-
countContentTokens,
|
|
5
|
-
countArtifactImages,
|
|
6
|
-
countArtifactTokens,
|
|
7
|
-
estimateTextTokens,
|
|
8
|
-
type TokenCountOptions,
|
|
9
|
-
} from "../tokenization";
|
|
10
|
-
|
|
11
|
-
/**
 * Settings accepted by `splitArtifact`, on top of the shared token-counting options.
 */
export type SplitOptions = TokenCountOptions & {
  /** Token budget for a single chunk; also determines the size of text slices. */
  maxTokens: number;
  /** Optional cap on the number of images in a single chunk; no image cap applies when omitted. */
  maxImages?: number;
  /** Optional debug logger that receives the chunking events. */
  debug?: DebugLogger;
};
|
|
16
|
-
|
|
17
|
-
/**
 * Splits one content entry into several when its text is over the token budget.
 *
 * Content without text, or whose estimated token count is within `maxTokens`, is returned
 * unchanged as a single-element array. Otherwise the text is cut into consecutive slices
 * of `maxTokens * textTokenRatio` UTF-16 code units (the ratio defaults to 4, and a slice
 * is never smaller than one unit). Every slice keeps the original `page`; `media` stays
 * on the first slice only.
 *
 * A slice boundary is never placed between the two halves of a surrogate pair: when the
 * nominal boundary would do so, the slice is extended by one code unit so that characters
 * outside the BMP (emoji, for example) are not torn into lone surrogates.
 *
 * @param content - The content entry to split.
 * @param maxTokens - Token budget for the text of one slice.
 * @param options - Token-counting options; `textTokenRatio` sets the slice size.
 * @param debug - Optional debug logger; a `chunkingSplit` event is emitted only when
 *   both `debug` and `artifactId` are provided and a split actually happens.
 * @param artifactId - Id of the owning artifact, used only for the debug event.
 * @returns The original content in a one-element array, or the slices in text order.
 */
const splitTextIntoChunks = (
  content: ArtifactContent,
  maxTokens: number,
  options?: TokenCountOptions,
  debug?: DebugLogger,
  artifactId?: string
): ArtifactContent[] => {
  const { text } = content;
  if (!text) {
    return [content];
  }

  const totalTokens = estimateTextTokens(text, options);
  if (totalTokens <= maxTokens) {
    return [content];
  }

  const ratio = options?.textTokenRatio ?? 4;
  const chunkSize = Math.max(1, maxTokens * ratio);
  const chunks: ArtifactContent[] = [];

  // UTF-16 surrogate ranges: high 0xD800-0xDBFF, low 0xDC00-0xDFFF.
  const isHighSurrogate = (code: number): boolean => (code & 0xfc00) === 0xd800;
  const isLowSurrogate = (code: number): boolean => (code & 0xfc00) === 0xdc00;

  let start = 0;
  while (start < text.length) {
    let end = start + chunkSize;

    // Keep a surrogate pair together by pulling its low half into this slice.
    if (
      end < text.length &&
      isHighSurrogate(text.charCodeAt(end - 1)) &&
      isLowSurrogate(text.charCodeAt(end))
    ) {
      end += 1;
    }

    chunks.push({
      page: content.page,
      text: text.slice(start, end),
      // Media is attached once, to the first slice, so images are not duplicated.
      media: start === 0 ? content.media : undefined,
    });

    start = end;
  }

  // Emitted after slicing so the reported count is the number of slices actually produced.
  if (debug && artifactId) {
    debug.chunkingSplit({
      artifactId,
      originalContentCount: 1,
      splitContentCount: chunks.length,
      splitReason: "text_too_long",
      originalTokens: totalTokens,
      chunkSize,
    });
  }

  return chunks;
};
|
|
60
|
-
|
|
61
|
-
/**
 * Splits an artifact into parts that respect a token budget and an optional image cap.
 *
 * Over-long text contents are first sliced by `splitTextIntoChunks`. The resulting
 * contents are then packed greedily in order: a content entry that would push the current
 * part past the token or image limit closes that part and opens a new one. The first
 * content of a part is always accepted, even when it is over a limit on its own.
 *
 * Each part is a shallow copy of the artifact with its own `contents`, a `tokens` total
 * and an id of the form `<artifact id>:part:<n>` (n starts at 1). An artifact with no
 * contents yields a single `:part:1` copy.
 *
 * @param artifact - The artifact to split.
 * @param options - Limits, token-counting options and an optional debug logger.
 * @returns At least one part, in content order.
 */
export const splitArtifact = (
  artifact: Artifact,
  options: SplitOptions
): Artifact[] => {
  const { maxTokens, maxImages, debug } = options;

  const totalTokens = countArtifactTokens(artifact, options);
  debug?.chunkingStart({
    artifactId: artifact.id,
    totalTokens,
    maxTokens,
    maxImages,
  });

  // Step 1: slice any over-long text so every content entry is individually packable.
  const splitContents: ArtifactContent[] = [];
  for (const content of artifact.contents) {
    splitContents.push(
      ...splitTextIntoChunks(content, maxTokens, options, debug, artifact.id)
    );
  }

  // Step 2: pack the contents greedily into parts.
  const chunks: Artifact[] = [];
  let currentContents: ArtifactContent[] = [];
  let currentTokens = 0;
  let currentImages = 0;

  // Appends the contents collected so far as the next numbered part.
  const pushCurrentChunk = (): void => {
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:${chunks.length + 1}`,
      contents: currentContents,
      tokens: currentTokens,
    });
  };

  for (const content of splitContents) {
    const contentTokens = countContentTokens(content, options);
    const contentImages = content.media?.length ?? 0;

    // An empty part never overflows: its first content is always accepted.
    const hasContents = currentContents.length > 0;
    const exceedsTokens = hasContents && currentTokens + contentTokens > maxTokens;
    const exceedsImages =
      hasContents &&
      maxImages !== undefined &&
      currentImages + contentImages > maxImages;

    if (exceedsTokens || exceedsImages) {
      // NOTE(review): the original selected the reason with a ternary whose two arms
      // were both "content_limit". If image overflow was meant to be reported
      // differently, add that value here once the logger's accepted reasons are confirmed.
      debug?.chunkingSplit({
        artifactId: artifact.id,
        originalContentCount: splitContents.length,
        splitContentCount: chunks.length + 1,
        splitReason: "content_limit",
        originalTokens: totalTokens,
        chunkSize: maxTokens,
      });

      pushCurrentChunk();
      currentContents = [];
      currentTokens = 0;
      currentImages = 0;
    }

    currentContents.push(content);
    currentTokens += contentTokens;
    currentImages += contentImages;
  }

  if (currentContents.length > 0) {
    pushCurrentChunk();
  }

  // No contents at all: still return one part so callers always get a result.
  if (chunks.length === 0) {
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:1`,
      // Same arguments as the count taken at the top, so that result is reused.
      tokens: totalTokens,
    });
  }

  debug?.chunkingResult({
    artifactId: artifact.id,
    chunksCreated: chunks.length,
    chunkSizes: chunks.map((chunk) => chunk.tokens ?? 0),
  });

  return chunks;
};
|
package/src/debug/AGENTS.md
DELETED
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
# Debug Module
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
The debug module provides comprehensive JSON logging for the Struktur extraction pipeline. When the `--debug` flag is enabled via the CLI, every operation is logged as single-line JSON to stderr.
|
|
6
|
-
|
|
7
|
-
## Key Files
|
|
8
|
-
|
|
9
|
-
- `logger.ts`: Core debug logger with structured logging functions for every pipeline stage.
|
|
10
|
-
|
|
11
|
-
## Debug Log Types
|
|
12
|
-
|
|
13
|
-
### CLI Initialization
|
|
14
|
-
- `cli_init`: CLI arguments and configuration
|
|
15
|
-
- `schema_loaded`: Schema source and size
|
|
16
|
-
- `artifacts_loaded`: Artifact count, types, tokens, images
|
|
17
|
-
- `model_resolved`: Model specification resolution
|
|
18
|
-
- `strategy_created`: Strategy selection with config
|
|
19
|
-
|
|
20
|
-
### Chunking
|
|
21
|
-
- `chunking_start`: Per-artifact chunking begins
|
|
22
|
-
- `chunking_split`: Text or content splits due to limits
|
|
23
|
-
- `chunking_result`: Final chunks created with sizes
|
|
24
|
-
|
|
25
|
-
### Batching
|
|
26
|
-
- `batching_start`: Batch creation parameters
|
|
27
|
-
- `batch_created`: Individual batch details
|
|
28
|
-
- `batching_complete`: Summary of all batches
|
|
29
|
-
|
|
30
|
-
### Strategy Execution
|
|
31
|
-
- `strategy_run_start`: Strategy begins with estimated steps
|
|
32
|
-
- `step`: Step progression through pipeline
|
|
33
|
-
- `progress`: Progress updates within steps
|
|
34
|
-
|
|
35
|
-
### LLM Calls
|
|
36
|
-
- `llm_call_start`: API call initiation with prompt sizes
|
|
37
|
-
- `prompt_system`: Full system prompt (verbose)
|
|
38
|
-
- `prompt_user`: Full user content (verbose)
|
|
39
|
-
- `llm_call_complete`: Call completion with tokens/timing
|
|
40
|
-
- `raw_response`: Raw LLM response data (verbose)
|
|
41
|
-
|
|
42
|
-
### Validation
|
|
43
|
-
- `validation_start`: Validation attempt begins
|
|
44
|
-
- `validation_success`: Validation passed
|
|
45
|
-
- `validation_failed`: Validation errors
|
|
46
|
-
- `retry`: Retry attempt triggered
|
|
47
|
-
|
|
48
|
-
### Merging
|
|
49
|
-
- `merge_start`: Merge operation begins
|
|
50
|
-
- `smart_merge_field`: Per-field merge operations
|
|
51
|
-
- `merge_complete`: Merge success/failure
|
|
52
|
-
|
|
53
|
-
### Deduplication
|
|
54
|
-
- `dedupe_start`: Deduplication begins
|
|
55
|
-
- `dedupe_complete`: Duplicates found and removed
|
|
56
|
-
|
|
57
|
-
### Results
|
|
58
|
-
- `token_usage`: Token consumption tracking
|
|
59
|
-
- `extraction_complete`: Final extraction status
|
|
60
|
-
|
|
61
|
-
## Usage
|
|
62
|
-
|
|
63
|
-
Enable via CLI:
|
|
64
|
-
```bash
|
|
65
|
-
struktur extract --debug -t "text to extract" -s schema.json
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
Debug logs are written to stderr as single-line JSON:
|
|
69
|
-
```json
|
|
70
|
-
{"timestamp":"2026-02-24T20:00:00.000Z","type":"cli_init","args":{"strategy":"simple"}}
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
## Design Notes
|
|
74
|
-
|
|
75
|
-
- All logs include ISO8601 timestamps
|
|
76
|
-
- Logs are single-line JSON for easy parsing
|
|
77
|
-
- Output goes to stderr to not interfere with stdout results
|
|
78
|
-
- The debug logger is passed through the entire pipeline via `ExtractionOptions.debug`
|
|
79
|
-
- When debug is disabled (default), all logging calls are no-ops
|
package/src/debug/logger.test.ts
DELETED
|
@@ -1,244 +0,0 @@
|
|
|
1
|
-
import { test, expect, beforeEach, afterEach } from "bun:test";
|
|
2
|
-
import { createDebugLogger } from "./logger";
|
|
3
|
-
|
|
4
|
-
// Everything written to stderr as a string during a test is collected here.
let capturedLines: string[];
const realStderrWrite = process.stderr.write;

// Replace stderr.write with a recorder before each test and restore it afterwards.
beforeEach(() => {
  capturedLines = [];
  process.stderr.write = (chunk: unknown) => {
    if (typeof chunk === "string") {
      capturedLines.push(chunk);
    }
    return true;
  };
});

afterEach(() => {
  process.stderr.write = realStderrWrite;
});

/** Creates a logger with debug output switched on. */
const enabledLogger = () => createDebugLogger(true);

/** Parses the first captured stderr line as JSON. */
const firstEntry = () => JSON.parse(capturedLines[0]!);

test("createDebugLogger with enabled=false is a no-op", () => {
  createDebugLogger(false).cliInit({ args: { test: true } });

  expect(capturedLines.length).toBe(0);
});

test("createDebugLogger with enabled=true logs to stderr", () => {
  enabledLogger().cliInit({ args: { test: true } });

  expect(capturedLines.length).toBe(1);
  const entry = firstEntry();
  expect(entry.type).toBe("cli_init");
  expect(entry.args).toEqual({ test: true });
  expect(entry.timestamp).toBeDefined();
});

test("cliInit logs correct type", () => {
  enabledLogger().cliInit({ args: { strategy: "simple" } });

  expect(firstEntry().type).toBe("cli_init");
});

test("schemaLoaded logs source and size", () => {
  enabledLogger().schemaLoaded({ source: "file.json", schemaSize: 100 });

  const entry = firstEntry();
  expect(entry.type).toBe("schema_loaded");
  expect(entry.source).toBe("file.json");
  expect(entry.schemaSize).toBe(100);
});

test("artifactsLoaded logs artifact details", () => {
  enabledLogger().artifactsLoaded({
    count: 2,
    artifacts: [
      { id: "a1", type: "text", contentCount: 1, tokens: 10 },
      { id: "a2", type: "pdf", contentCount: 3 },
    ],
    totalTokens: 1010,
    totalImages: 2,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("artifacts_loaded");
  expect(entry.count).toBe(2);
  expect(entry.totalTokens).toBe(1010);
  expect(entry.totalImages).toBe(2);
});

test("chunkingStart logs chunking parameters", () => {
  enabledLogger().chunkingStart({
    artifactId: "a1",
    totalTokens: 100,
    maxTokens: 50,
    maxImages: 5,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("chunking_start");
  expect(entry.artifactId).toBe("a1");
  expect(entry.maxTokens).toBe(50);
});

test("llmCallStart logs call details", () => {
  enabledLogger().llmCallStart({
    callId: "call-1",
    model: "gpt-4",
    schemaName: "extract",
    systemLength: 100,
    userLength: 200,
    artifactCount: 3,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("llm_call_start");
  expect(entry.callId).toBe("call-1");
  expect(entry.artifactCount).toBe(3);
});

test("llmCallComplete logs success with duration", () => {
  enabledLogger().llmCallComplete({
    callId: "call-1",
    success: true,
    inputTokens: 100,
    outputTokens: 50,
    totalTokens: 150,
    durationMs: 1234,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("llm_call_complete");
  expect(entry.success).toBe(true);
  expect(entry.durationMs).toBe(1234);
});

test("llmCallComplete logs failure with error", () => {
  enabledLogger().llmCallComplete({
    callId: "call-1",
    success: false,
    inputTokens: 100,
    outputTokens: 0,
    totalTokens: 100,
    error: "API error",
  });

  const entry = firstEntry();
  expect(entry.success).toBe(false);
  expect(entry.error).toBe("API error");
});

test("retry logs retry attempt", () => {
  enabledLogger().retry({
    callId: "call-1",
    attempt: 2,
    maxAttempts: 3,
    reason: "schema_validation_failed",
  });

  const entry = firstEntry();
  expect(entry.type).toBe("retry");
  expect(entry.attempt).toBe(2);
  expect(entry.reason).toBe("schema_validation_failed");
});

test("validationStart logs validation attempt", () => {
  enabledLogger().validationStart({
    callId: "call-1",
    attempt: 1,
    maxAttempts: 3,
    strict: false,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("validation_start");
  expect(entry.strict).toBe(false);
});

test("validationSuccess logs successful validation", () => {
  enabledLogger().validationSuccess({ callId: "call-1", attempt: 1 });

  expect(firstEntry().type).toBe("validation_success");
});

test("validationFailed logs validation errors", () => {
  enabledLogger().validationFailed({
    callId: "call-1",
    attempt: 1,
    errors: [{ keyword: "required", message: "missing field" }],
  });

  const entry = firstEntry();
  expect(entry.type).toBe("validation_failed");
  expect(entry.errors).toBeDefined();
});

test("mergeStart logs merge operation", () => {
  enabledLogger().mergeStart({
    mergeId: "merge-1",
    inputCount: 3,
    strategy: "parallel",
  });

  const entry = firstEntry();
  expect(entry.type).toBe("merge_start");
  expect(entry.inputCount).toBe(3);
});

test("mergeComplete logs merge result", () => {
  enabledLogger().mergeComplete({ mergeId: "merge-1", success: true });

  const entry = firstEntry();
  expect(entry.type).toBe("merge_complete");
  expect(entry.success).toBe(true);
});

test("dedupeStart logs deduplication start", () => {
  enabledLogger().dedupeStart({ dedupeId: "dedupe-1", itemCount: 10 });

  const entry = firstEntry();
  expect(entry.type).toBe("dedupe_start");
  expect(entry.itemCount).toBe(10);
});

test("dedupeComplete logs deduplication result", () => {
  enabledLogger().dedupeComplete({
    dedupeId: "dedupe-1",
    duplicatesFound: 3,
    itemsRemoved: 3,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("dedupe_complete");
  expect(entry.duplicatesFound).toBe(3);
});

test("extractionComplete logs final result", () => {
  enabledLogger().extractionComplete({
    success: true,
    totalInputTokens: 100,
    totalOutputTokens: 50,
    totalTokens: 150,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("extraction_complete");
  expect(entry.success).toBe(true);
});

test("smartMergeField logs field merge operation", () => {
  enabledLogger().smartMergeField({
    mergeId: "merge-1",
    field: "items",
    operation: "merge_arrays",
    leftCount: 5,
    rightCount: 3,
    resultCount: 8,
  });

  const entry = firstEntry();
  expect(entry.type).toBe("smart_merge_field");
  expect(entry.operation).toBe("merge_arrays");
});
|