@struktur/sdk 2.1.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/artifacts/fileToArtifact.d.ts +8 -0
- package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
- package/dist/artifacts/input.d.ts +60 -0
- package/dist/artifacts/input.d.ts.map +1 -0
- package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
- package/dist/artifacts/providers.d.ts.map +1 -0
- package/dist/artifacts/urlToArtifact.d.ts +3 -0
- package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
- package/dist/auth/config.d.ts +34 -0
- package/dist/auth/config.d.ts.map +1 -0
- package/dist/auth/tokens.d.ts +18 -0
- package/dist/auth/tokens.d.ts.map +1 -0
- package/dist/chunking/ArtifactBatcher.d.ts +11 -0
- package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
- package/dist/chunking/ArtifactSplitter.d.ts +10 -0
- package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
- package/dist/debug/logger.d.ts +169 -0
- package/dist/debug/logger.d.ts.map +1 -0
- package/dist/extract.d.ts +3 -0
- package/dist/extract.d.ts.map +1 -0
- package/dist/fields.d.ts +75 -0
- package/dist/fields.d.ts.map +1 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5603 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/LLMClient.d.ts +40 -0
- package/dist/llm/LLMClient.d.ts.map +1 -0
- package/dist/llm/RetryingRunner.d.ts +37 -0
- package/dist/llm/RetryingRunner.d.ts.map +1 -0
- package/dist/llm/message.d.ts +12 -0
- package/dist/llm/message.d.ts.map +1 -0
- package/dist/llm/models.d.ts +13 -0
- package/dist/llm/models.d.ts.map +1 -0
- package/dist/llm/resolveModel.d.ts +3 -0
- package/dist/llm/resolveModel.d.ts.map +1 -0
- package/dist/merge/Deduplicator.d.ts +4 -0
- package/dist/merge/Deduplicator.d.ts.map +1 -0
- package/dist/merge/SmartDataMerger.d.ts +7 -0
- package/dist/merge/SmartDataMerger.d.ts.map +1 -0
- package/dist/parsers/collect.d.ts +7 -0
- package/dist/parsers/collect.d.ts.map +1 -0
- package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/mime.d.ts +12 -0
- package/dist/parsers/mime.d.ts.map +1 -0
- package/dist/parsers/npm.d.ts +16 -0
- package/dist/parsers/npm.d.ts.map +1 -0
- package/dist/parsers/pdf.d.ts +36 -0
- package/dist/parsers/pdf.d.ts.map +1 -0
- package/dist/parsers/runner.d.ts +4 -0
- package/dist/parsers/runner.d.ts.map +1 -0
- package/dist/parsers/types.d.ts +27 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers.d.ts +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
- package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
- package/dist/prompts/ExtractorPrompt.d.ts +6 -0
- package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/formatArtifacts.d.ts +3 -0
- package/dist/prompts/formatArtifacts.d.ts.map +1 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/DoublePassStrategy.d.ts +22 -0
- package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelStrategy.d.ts +22 -0
- package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialStrategy.d.ts +20 -0
- package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
- package/dist/strategies/SimpleStrategy.d.ts +18 -0
- package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
- package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentTools.d.ts +55 -0
- package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
- package/dist/strategies/agent/index.d.ts +4 -0
- package/dist/strategies/agent/index.d.ts.map +1 -0
- package/dist/strategies/concurrency.d.ts +2 -0
- package/dist/strategies/concurrency.d.ts.map +1 -0
- package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
- package/dist/strategies/index.d.ts.map +1 -0
- package/dist/strategies/utils.d.ts +39 -0
- package/dist/strategies/utils.d.ts.map +1 -0
- package/dist/strategies.d.ts +1 -0
- package/dist/strategies.js +3930 -0
- package/dist/strategies.js.map +1 -0
- package/dist/tokenization.d.ts +11 -0
- package/dist/tokenization.d.ts.map +1 -0
- package/dist/types.d.ts +178 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/validation/validator.d.ts +20 -0
- package/dist/validation/validator.d.ts.map +1 -0
- package/package.json +30 -14
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
|
@@ -0,0 +1,3930 @@
|
|
|
1
|
+
// src/prompts/formatArtifacts.ts
|
|
2
|
+
/**
 * Resolve the reference string that identifies an image inside the artifacts XML.
 *
 * @param {string} artifactId - Id of the artifact that owns the image.
 * @param {number} index - Zero-based position of the image within its content entry.
 * @param {{ url?: string, base64?: string }} image - Image descriptor.
 * @returns {string} The image URL when one is set, otherwise a synthetic
 *   `artifact:<id>/images/image<N>.<ext>` reference (N is one-based).
 */
var imageRefFor = (artifactId, index, image) => {
  if (!image.url) {
    // Base64 payloads are labelled "png"; anything else gets a generic "bin" suffix.
    const suffix = image.base64 ? "png" : "bin";
    const position = index + 1;
    return `artifact:${artifactId}/images/image${position}.${suffix}`;
  }
  return image.url;
};
|
|
9
|
+
/**
 * Escape the five XML special characters so a value can be embedded safely
 * in element text or in a quoted attribute.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text with `&`, `<`, `>`, `"` and `'` replaced by their
 *   predefined XML entities.
 */
var escapeXml = (value) => {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;"
  };
  // Single pass, so an ampersand produced by one replacement is never escaped again.
  return value.replace(/[&<>"']/g, (char) => entities[char]);
};
|
|
12
|
+
/**
 * Serialize artifacts into the XML-like text block that is sent to the model.
 *
 * Each artifact becomes an `<artifact>` element; every content entry contributes
 * an optional `<text>` element followed by one `<image>` element per media item.
 *
 * @param {Array<object>} artifacts - Artifacts with `id`, `type` and `contents`.
 * @returns {string} Newline-joined XML lines (empty string for no artifacts).
 */
var formatArtifactsXml = (artifacts) => {
  const lines = [];
  for (const artifact of artifacts) {
    lines.push(`<artifact id="${escapeXml(artifact.id)}" type="${artifact.type}">`);
    for (const content of artifact.contents) {
      // The page attribute is shared by the text element and all images of this entry.
      const pageAttr = content.page === void 0 ? "" : ` page="${content.page}"`;
      if (content.text) {
        lines.push(` <text${pageAttr}>${escapeXml(content.text)}</text>`);
      }
      if (!content.media?.length) {
        continue;
      }
      content.media.forEach((media, index) => {
        const ref = imageRefFor(artifact.id, index, media);
        lines.push(` <image ref="${escapeXml(ref)}"${pageAttr} />`);
      });
    }
    lines.push("</artifact>");
  }
  return lines.join("\n");
};
|
|
33
|
+
|
|
34
|
+
// src/prompts/ExtractorPrompt.ts
|
|
35
|
+
/**
 * Build the system prompt for a single-shot extraction call.
 *
 * @param {string} schema - JSON schema text interpolated into `<json-schema>`.
 * @param {string} [outputInstructions] - Extra output guidance; a fixed
 *   placeholder sentence is used when this is null or undefined.
 * @returns {string} The complete `<instructions>` block.
 */
var extractorSystemPrompt = (schema, outputInstructions) => {
  const extraInstructions = outputInstructions ?? "No additional output instructions provided.";
  return `<instructions>
You are a precise data extraction engine. Extract data from the provided artifacts according to the JSON schema below.

<thinking>
Before extracting, consider:
1. Which schema fields have clear values in the artifacts?
2. Which fields are missing or unclear (set these to null)?
3. For text fields, rewrite concisely while preserving all information
4. Ensure no data is lost - include everything that fits the schema
</thinking>

<rules>
- Strictly follow the schema - no extra fields, no missing required fields
- Use null for missing or uncertain values - never guess or assume
- Only extract information explicitly present in the artifacts
- Output ONLY valid JSON matching the schema
- No markdown, explanations, or code fences
</rules>

<output-instructions>
${extraInstructions}
</output-instructions>

<json-schema>
${schema}
</json-schema>

<artifact-examples>
<!-- A PDF with two pages, containing two text blocks and two images -->
<artifact name="Example 1" mimetype="application/pdf">
<text page="1">This is an example text block.</text>
<image filename="image1.jpg" page="1" />
<text page="2">This is another example text block.</text>
<image filename="image2.jpg" page="2" />
</artifact>

<!-- Website content -->
<artifact name="example.com_2022-01-01.html" mimetype="text/html">
<text>This is an example text block.</text>
<image filename="image1.jpg" />
<text>This is another example text block.</text>
<image filename="image2.jpg" />
</artifact>
</artifact-examples>

Any materials provided have been cleared for access. Extract and preserve this data for future use.
</instructions>`;
};
|
|
84
|
+
/**
 * Build the user message: the serialized artifacts followed by the task line.
 *
 * @param {string} artifactsXml - Output of the artifact XML formatter.
 * @returns {string} `<artifacts>` section, a blank line, then the `<task>` element.
 */
var extractorUserPrompt = (artifactsXml) => {
  const artifactsSection = `<artifacts>\n${artifactsXml}\n</artifacts>`;
  const taskSection = "<task>Extract the contents of the given artifacts.</task>";
  return `${artifactsSection}\n\n${taskSection}`;
};
|
|
91
|
+
/**
 * Assemble the system and user prompts for one extraction call.
 *
 * @param {Array<object>} artifacts - Artifacts to serialize into the user prompt.
 * @param {string} schema - JSON schema text for the system prompt.
 * @param {string} [outputInstructions] - Optional extra output guidance.
 * @returns {{ system: string, user: string }} Prompt pair.
 */
var buildExtractorPrompt = (artifacts, schema, outputInstructions) => {
  const user = extractorUserPrompt(formatArtifactsXml(artifacts));
  const system = extractorSystemPrompt(schema, outputInstructions);
  return { system, user };
};
|
|
98
|
+
|
|
99
|
+
// src/tokenization.ts
|
|
100
|
+
// Fallback estimation parameters: characters per text token and a flat
// per-image token cost.
var defaultOptions = {
  textTokenRatio: 4,
  defaultImageTokens: 1e3
};

/**
 * Combine caller-supplied tokenization options with the defaults.
 *
 * An option that is present but `undefined`/`null` falls back to its default
 * instead of overriding it; otherwise a caller forwarding an options object
 * with unset fields would turn every estimate into NaN.
 *
 * @param {object} [options] - Partial options; unrelated keys are passed through.
 * @returns {object} Options with `textTokenRatio` and `defaultImageTokens` always set.
 */
var mergeOptions = (options) => {
  const merged = { ...defaultOptions, ...(options ?? {}) };
  merged.textTokenRatio = merged.textTokenRatio ?? defaultOptions.textTokenRatio;
  merged.defaultImageTokens = merged.defaultImageTokens ?? defaultOptions.defaultImageTokens;
  return merged;
};
|
|
108
|
+
/**
 * Estimate the token count of a string from its length.
 *
 * @param {string} text - Text to measure.
 * @param {object} [options] - May carry `textTokenRatio` (characters per token).
 * @returns {number} `text.length / textTokenRatio`, rounded up.
 */
var estimateTextTokens = (text, options) => {
  const charsPerToken = mergeOptions(options).textTokenRatio;
  return Math.ceil(text.length / charsPerToken);
};
|
|
112
|
+
/**
 * Estimate the token cost of one image. The image itself is not inspected;
 * every image costs the configured flat amount.
 *
 * @param {object} _image - Unused.
 * @param {object} [options] - May carry `defaultImageTokens`.
 * @returns {number} The flat per-image token estimate.
 */
var estimateImageTokens = (_image, options) => mergeOptions(options).defaultImageTokens;
|
|
116
|
+
/**
 * Estimate the tokens of one content entry: its text, plus for every media
 * item the image estimate and the estimate of the item's own text, if any.
 *
 * @param {{ text?: string, media?: Array<object> }} content - Content entry.
 * @param {object} [options] - Tokenization options.
 * @returns {number} Estimated token total (0 for an empty entry).
 */
var countContentTokens = (content, options) => {
  let total = content.text ? estimateTextTokens(content.text, options) : 0;
  if (!content.media?.length) {
    return total;
  }
  for (const item of content.media) {
    total += estimateImageTokens(item, options);
    if (item.text) {
      total += estimateTextTokens(item.text, options);
    }
  }
  return total;
};
|
|
131
|
+
/**
 * Token count of an artifact. A numeric `tokens` field on the artifact is
 * trusted as-is; otherwise the estimates of all content entries are summed.
 *
 * @param {{ tokens?: number, contents: Array<object> }} artifact - Artifact.
 * @param {object} [options] - Tokenization options.
 * @returns {number} Token count.
 */
var countArtifactTokens = (artifact, options) => {
  if (typeof artifact.tokens === "number") {
    return artifact.tokens;
  }
  let sum = 0;
  for (const content of artifact.contents) {
    sum += countContentTokens(content, options);
  }
  return sum;
};
|
|
140
|
+
/**
 * Count the media items across all content entries of an artifact.
 *
 * @param {{ contents: Array<{ media?: Array<object> }> }} artifact - Artifact.
 * @returns {number} Number of media items.
 */
var countArtifactImages = (artifact) => {
  let images = 0;
  for (const content of artifact.contents) {
    images += content.media?.length ?? 0;
  }
  return images;
};
|
|
145
|
+
|
|
146
|
+
// src/chunking/ArtifactSplitter.ts
|
|
147
|
+
/**
 * Split one content entry into several when its text exceeds the token budget.
 *
 * Entries without text, or whose text fits, are returned unchanged as a
 * one-element array. Otherwise the text is cut into slices of
 * `maxTokens * textTokenRatio` characters (at least 1); the entry's media
 * stays on the first slice only.
 *
 * @param {{ text?: string, page?: number, media?: Array<object> }} content - Entry to split.
 * @param {number} maxTokens - Token budget per slice.
 * @param {object} [options] - Tokenization options (`textTokenRatio`, default 4).
 * @param {object} [debug] - Optional debug logger; `chunkingSplit` is called when a split happens.
 * @param {string} [artifactId] - Owning artifact id, required for the debug event.
 * @returns {Array<object>} The original entry or its slices.
 */
var splitTextIntoChunks = (content, maxTokens, options, debug, artifactId) => {
  const { text } = content;
  if (!text) {
    return [content];
  }
  const totalTokens = estimateTextTokens(text, options);
  if (totalTokens <= maxTokens) {
    return [content];
  }
  const charsPerToken = options?.textTokenRatio ?? 4;
  const chunkSize = Math.max(1, maxTokens * charsPerToken);
  if (debug && artifactId) {
    debug.chunkingSplit({
      artifactId,
      originalContentCount: 1,
      splitContentCount: Math.ceil(text.length / chunkSize),
      splitReason: "text_too_long",
      originalTokens: totalTokens,
      chunkSize
    });
  }
  const pieces = [];
  let offset = 0;
  while (offset < text.length) {
    pieces.push({
      page: content.page,
      text: text.slice(offset, offset + chunkSize),
      // Media is attached once, to the first slice.
      media: offset === 0 ? content.media : void 0
    });
    offset += chunkSize;
  }
  return pieces;
};
|
|
178
|
+
/**
 * Split an artifact into parts that respect the token and image limits.
 *
 * Over-long text entries are sliced first, then entries are packed greedily
 * into parts; a new part starts when adding the next entry would exceed
 * `maxTokens` or `maxImages`. A part always takes at least one entry, so a
 * single oversized entry still forms its own part. Parts get the id
 * `<artifactId>:part:<n>` (one-based) and a `tokens` field.
 *
 * @param {object} artifact - Artifact with `id` and `contents`.
 * @param {{ maxTokens: number, maxImages?: number, debug?: object }} options -
 *   Limits, optional debug logger, and tokenization options.
 * @returns {Array<object>} One or more artifact parts.
 */
var splitArtifact = (artifact, options) => {
  const { maxTokens, maxImages, debug } = options;
  const totalTokens = countArtifactTokens(artifact, options);
  debug?.chunkingStart({
    artifactId: artifact.id,
    totalTokens,
    maxTokens,
    maxImages
  });
  const pieces = artifact.contents.flatMap(
    (content) => splitTextIntoChunks(content, maxTokens, options, debug, artifact.id)
  );
  const chunks = [];
  let pending = { contents: [], tokens: 0, images: 0 };
  // Close the part being assembled and start a fresh one.
  const flush = () => {
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:${chunks.length + 1}`,
      contents: pending.contents,
      tokens: pending.tokens
    });
    pending = { contents: [], tokens: 0, images: 0 };
  };
  for (const piece of pieces) {
    const pieceTokens = countContentTokens(piece, options);
    const pieceImages = piece.media?.length ?? 0;
    const hasPending = pending.contents.length > 0;
    const overTokens = hasPending && pending.tokens + pieceTokens > maxTokens;
    const overImages = maxImages !== void 0 && hasPending && pending.images + pieceImages > maxImages;
    if (overTokens || overImages) {
      if (debug) {
        debug.chunkingSplit({
          artifactId: artifact.id,
          originalContentCount: pieces.length,
          splitContentCount: chunks.length + 1,
          // The same reason is reported for token and image overflow.
          splitReason: "content_limit",
          originalTokens: totalTokens,
          chunkSize: maxTokens
        });
      }
      flush();
    }
    pending.contents.push(piece);
    pending.tokens += pieceTokens;
    pending.images += pieceImages;
  }
  if (pending.contents.length > 0) {
    flush();
  }
  if (chunks.length === 0) {
    // Artifact without contents: emit it as a single part, contents untouched.
    chunks.push({
      ...artifact,
      id: `${artifact.id}:part:1`,
      tokens: countArtifactTokens(artifact, options)
    });
  }
  debug?.chunkingResult({
    artifactId: artifact.id,
    chunksCreated: chunks.length,
    chunkSizes: chunks.map((chunk) => chunk.tokens ?? 0)
  });
  return chunks;
};
|
|
247
|
+
|
|
248
|
+
// src/chunking/ArtifactBatcher.ts
|
|
249
|
+
/**
 * Group artifacts into batches that fit the token and image limits.
 *
 * Every artifact is first split into parts, then parts are packed greedily
 * in order. The effective token limit is the smaller of `maxTokens` and
 * `modelMaxTokens` when the latter is set.
 *
 * @param {Array<object>} artifacts - Artifacts to batch.
 * @param {object} options - `maxTokens`, optional `modelMaxTokens`,
 *   `maxImages`, `textTokenRatio`, `defaultImageTokens` and `debug` logger.
 * @returns {Array<Array<object>>} Batches of artifact parts.
 */
var batchArtifacts = (artifacts, options) => {
  const { debug } = options;
  const maxTokens = options.modelMaxTokens
    ? Math.min(options.maxTokens, options.modelMaxTokens)
    : options.maxTokens;
  debug?.batchingStart({
    totalArtifacts: artifacts.length,
    maxTokens: options.maxTokens,
    maxImages: options.maxImages,
    modelMaxTokens: options.modelMaxTokens,
    effectiveMaxTokens: maxTokens
  });
  const batches = [];
  let open = [];
  let openTokens = 0;
  let openImages = 0;
  // Report and store the batch being assembled, then start a new one.
  const closeBatch = () => {
    debug?.batchCreated({
      batchIndex: batches.length,
      artifactCount: open.length,
      totalTokens: openTokens,
      totalImages: openImages,
      artifactIds: open.map((a) => a.id)
    });
    batches.push(open);
    open = [];
    openTokens = 0;
    openImages = 0;
  };
  for (const artifact of artifacts) {
    // Forward only the optional settings the caller actually provided.
    const splitOptions = { maxTokens, debug };
    for (const key of ["maxImages", "textTokenRatio", "defaultImageTokens"]) {
      if (options[key] !== void 0) {
        splitOptions[key] = options[key];
      }
    }
    for (const part of splitArtifact(artifact, splitOptions)) {
      const partTokens = countArtifactTokens(part, options);
      const partImages = countArtifactImages(part);
      const hasOpen = open.length > 0;
      const overTokens = hasOpen && openTokens + partTokens > maxTokens;
      const overImages = options.maxImages !== void 0 && hasOpen && openImages + partImages > options.maxImages;
      if (overTokens || overImages) {
        closeBatch();
      }
      open.push(part);
      openTokens += partTokens;
      openImages += partImages;
    }
  }
  if (open.length > 0) {
    closeBatch();
  }
  debug?.batchingComplete({
    totalBatches: batches.length,
    batches: batches.map((batch, index) => ({
      index,
      artifactCount: batch.length,
      tokens: batch.reduce((sum, a) => sum + (a.tokens ?? 0), 0),
      images: batch.reduce((sum, a) => sum + countArtifactImages(a), 0)
    }))
  });
  return batches;
};
|
|
319
|
+
|
|
320
|
+
// src/llm/message.ts
|
|
321
|
+
/**
 * Gather the image message parts for all media in the given artifacts.
 *
 * For each media item the first truthy of `contents`, `base64`, `url` is
 * used as the image payload; items with none of them are skipped.
 *
 * @param {Array<{ contents: Array<{ media?: Array<object> }> }>} artifacts - Artifacts to scan.
 * @returns {Array<{ type: "image", image: * }>} Image parts in document order.
 */
var collectImages = (artifacts) => {
  const images = [];
  for (const { contents } of artifacts) {
    for (const { media } of contents) {
      if (!media?.length) {
        continue;
      }
      for (const item of media) {
        const payload = item.contents || item.base64 || item.url;
        if (payload) {
          images.push({ type: "image", image: payload });
        }
      }
    }
  }
  return images;
};
|
|
341
|
+
/**
 * Build the content of a user message.
 *
 * @param {string} text - Prompt text.
 * @param {Array<object>} artifacts - Artifacts whose images should be attached.
 * @returns {string|Array<object>} The plain text when there are no images,
 *   otherwise a text part followed by the image parts.
 */
var buildUserContent = (text, artifacts) => {
  const imageParts = collectImages(artifacts);
  return imageParts.length === 0 ? text : [{ type: "text", text }, ...imageParts];
};
|
|
348
|
+
|
|
349
|
+
// src/validation/validator.ts
|
|
350
|
+
import Ajv from "ajv";
|
|
351
|
+
import addFormats from "ajv-formats";
|
|
352
|
+
/**
 * Error raised when data does not satisfy a JSON schema.
 * `errors` carries the validator's error list passed to the constructor.
 */
var SchemaValidationError = class extends Error {
  errors;
  name = "SchemaValidationError";
  /**
   * @param {string} message - Human-readable summary.
   * @param {Array<object>} errors - Validation errors.
   */
  constructor(message, errors) {
    super(message);
    this.errors = errors;
  }
};
|
|
360
|
+
// Shape of a synthetic image reference: artifact:<id>/images/image<N>.<ext>
var ARTIFACT_ID_PATTERN = /^artifact:[^/]+\/images\/image\d+\.\w+$/;

/**
 * Create an Ajv instance configured for extraction schemas: all errors are
 * collected, strict mode is off, union types are allowed, the ajv-formats
 * formats are registered, and a custom `artifact-id` string format is added.
 *
 * @returns {Ajv} Configured validator instance.
 */
var createAjv = () => {
  const instance = new Ajv({ allErrors: true, strict: false, allowUnionTypes: true });
  addFormats(instance);
  const isArtifactId = (data) => ARTIFACT_ID_PATTERN.test(data);
  instance.addFormat("artifact-id", { type: "string", validate: isArtifactId });
  return instance;
};
|
|
374
|
+
/**
 * Validate data against a schema and return it unchanged when valid.
 *
 * @param {object} ajv - Validator exposing `compile(schema)`.
 * @param {object} schema - JSON schema.
 * @param {*} data - Value to validate.
 * @returns {*} The same `data` value.
 * @throws {SchemaValidationError} When validation fails; carries the error list.
 */
var validateOrThrow = (ajv, schema, data) => {
  const check = ajv.compile(schema);
  if (check(data)) {
    return data;
  }
  throw new SchemaValidationError("Schema validation failed", check.errors ?? []);
};
|
|
384
|
+
/**
 * Tell whether a validation error reports a missing required property.
 *
 * @param {{ keyword: string }} error - Validation error object.
 * @returns {boolean} True when the error keyword is "required".
 */
var isRequiredError = (error) => error.keyword === "required";
|
|
387
|
+
/**
 * Validate data while tolerating missing required properties.
 *
 * - Valid data → `{ valid: true, data }`.
 * - Any error other than "required" → `{ valid: false, errors }` with only
 *   those other errors.
 * - Only "required" errors → accepted on the final attempt, otherwise
 *   rejected with the full error list.
 *
 * @param {object} ajv - Validator exposing `compile(schema)`.
 * @param {object} schema - JSON schema.
 * @param {*} data - Value to validate.
 * @param {boolean} [isFinalAttempt=true] - Whether missing required fields are acceptable.
 * @returns {{ valid: true, data: * } | { valid: false, errors: Array<object> }}
 */
var validateAllowingMissingRequired = (ajv, schema, data, isFinalAttempt = true) => {
  const check = ajv.compile(schema);
  if (check(data)) {
    return { valid: true, data };
  }
  const allErrors = check.errors ?? [];
  const blocking = allErrors.filter((entry) => entry.keyword !== "required");
  if (blocking.length > 0) {
    return { valid: false, errors: blocking };
  }
  return isFinalAttempt ? { valid: true, data } : { valid: false, errors: allErrors };
};
|
|
403
|
+
|
|
404
|
+
// src/llm/LLMClient.ts
|
|
405
|
+
import { generateText, Output, jsonSchema } from "ai";
|
|
406
|
+
/**
 * Duck-type check for a Zod schema: a non-null object with a `safeParse` function.
 *
 * @param {*} schema - Candidate schema.
 * @returns {boolean} True when the value looks like a Zod schema.
 */
var isZodSchema = (schema) => {
  if (typeof schema !== "object" || schema === null) {
    return false;
  }
  return "safeParse" in schema && typeof schema.safeParse === "function";
};
|
|
409
|
+
/**
 * Translate a provider API error into a user-friendly Error.
 *
 * Only errors that expose both `responseBody` and `statusCode` are considered.
 * The original error is attached as `cause`.
 *
 * @param {*} error - Value caught from the model call.
 * @param {string} modelId - Model identifier used in the messages.
 * @returns {Error|undefined} Friendly error, or undefined when no mapping applies.
 */
var toFriendlyLlmError = (error, modelId) => {
  if (!error || typeof error !== "object" || !("responseBody" in error) || !("statusCode" in error)) {
    return void 0;
  }
  const { responseBody, statusCode, data: errorData } = error;
  const code = errorData?.code;
  // Guard against non-string messages before calling string methods on them.
  const detail = typeof errorData?.message === "string" ? errorData.message : void 0;
  const friendly = (message) => new Error(message, { cause: error });
  if (typeof responseBody === "string" && responseBody.includes("No endpoints found that support image input")) {
    return friendly(
      `Model "${modelId}" does not support image input. Please use a model that supports images (e.g., gpt-4o, claude-3-5-sonnet, gemini-1.5-pro) or remove the --images and --screenshots flags.`
    );
  }
  if (code === 500 || detail?.includes("Internal Server Error")) {
    return friendly(
      `Provider error for model "${modelId}": Internal server error. The model or provider may be experiencing issues. Please try again or use a different model.`
    );
  }
  if (statusCode === 401 || code === 401) {
    return friendly(
      `Authentication failed for model "${modelId}". Please check your API key is valid and has the necessary permissions.`
    );
  }
  if (statusCode === 403 || code === 403) {
    return friendly(
      `Access denied for model "${modelId}". Your API key may not have access to this model. Please check your subscription or try a different model.`
    );
  }
  if (statusCode === 429 || code === 429) {
    return friendly(
      `Rate limit exceeded for model "${modelId}". Please wait a moment and try again, or use a different model.`
    );
  }
  if (statusCode === 404 || code === 404) {
    const errorMsg = errorData?.message || "Model not found";
    return friendly(
      `Model "${modelId}" not found or unavailable. ${errorMsg} Please check the model name or try a different model.`
    );
  }
  if (errorData?.message) {
    return friendly(`Provider error for model "${modelId}": ${errorData.message}`);
  }
  return void 0;
};

/**
 * Run one structured-output model call and return the parsed object with
 * token usage.
 *
 * @param {object} request - Call description: `model`, `schema` (Zod or JSON
 *   schema), optional `schemaName`, `schemaDescription`, `strict`, `system`,
 *   `user` or `messages`, and optional `telemetry` / `parentSpan`.
 * @returns {Promise<{ data: *, usage: { inputTokens: number, outputTokens: number, totalTokens: number } }>}
 * @throws {Error} A friendly error (with the provider error as `cause`) for
 *   recognised API failures, otherwise the original error.
 */
var generateStructured = async (request) => {
  const { telemetry, parentSpan } = request;
  const schemaName = request.schemaName ?? "extract";
  const strict = request.strict ?? false;
  const llmSpan = telemetry?.startSpan({
    name: "llm.generateStructured",
    kind: "LLM",
    parentSpan,
    attributes: {
      "llm.schema_name": schemaName,
      "llm.strict": strict
    }
  });
  const startTime = Date.now();
  const schema = isZodSchema(request.schema) ? request.schema : jsonSchema(request.schema);
  const preferredProvider = request.model?.__openrouter_provider;
  if (preferredProvider && process.env.DEBUG) {
    console.error(
      `[DEBUG] Routing to OpenRouter provider: ${preferredProvider}`
    );
  }
  // Merge the provider settings into ONE object. Passing `providerOptions`
  // twice made the OpenRouter routing replace the OpenAI strict-schema setting.
  const providerOptions = {
    openai: {
      strictJsonSchema: strict
    },
    ...(preferredProvider ? { openrouter: { provider: { order: [preferredProvider] } } } : {})
  };
  // Same input snapshot is reported to telemetry on success and on failure.
  const telemetryInput = {
    messages: request.messages ?? [{ role: "user", content: typeof request.user === "string" ? request.user : "" }],
    temperature: void 0,
    maxTokens: void 0,
    schema: request.schema
  };
  let result;
  try {
    result = await generateText({
      model: request.model,
      output: Output.object({
        schema,
        name: schemaName,
        description: request.schemaDescription
      }),
      providerOptions,
      system: request.system,
      messages: request.messages ?? [
        { role: "user", content: request.user }
      ]
    });
  } catch (error) {
    const modelId = typeof request.model === "object" && request.model !== null ? request.model.modelId ?? JSON.stringify(request.model) : String(request.model);
    // Map first, record second: the span must be closed for mapped errors too.
    const surfaced = toFriendlyLlmError(error, modelId) ?? error;
    if (llmSpan && telemetry) {
      const latencyMs = Date.now() - startTime;
      const telemetryError = surfaced instanceof Error ? surfaced : new Error(String(surfaced));
      telemetry.recordEvent(llmSpan, {
        type: "llm_call",
        model: modelId,
        provider: preferredProvider ?? "unknown",
        input: telemetryInput,
        error: telemetryError,
        latencyMs
      });
      telemetry.endSpan(llmSpan, {
        status: "error",
        error: telemetryError,
        latencyMs
      });
    }
    throw surfaced;
  }
  // Accept both usage naming schemes and never let an undefined field leak
  // into the totals (a key can be present with an undefined value).
  const usageRaw = result.usage ?? {};
  const inputTokens = usageRaw.promptTokens ?? usageRaw.inputTokens ?? 0;
  const outputTokens = usageRaw.completionTokens ?? usageRaw.outputTokens ?? 0;
  const totalTokens = usageRaw.totalTokens ?? inputTokens + outputTokens;
  const usage = {
    inputTokens,
    outputTokens,
    totalTokens
  };
  if (llmSpan && telemetry) {
    const latencyMs = Date.now() - startTime;
    telemetry.recordEvent(llmSpan, {
      type: "llm_call",
      model: typeof request.model === "object" && request.model !== null ? request.model.modelId ?? "unknown" : String(request.model),
      provider: preferredProvider ?? "unknown",
      input: telemetryInput,
      output: {
        content: JSON.stringify(result.output),
        structured: true,
        usage: {
          input: inputTokens,
          output: outputTokens,
          total: totalTokens
        }
      },
      latencyMs
    });
    telemetry.endSpan(llmSpan, {
      status: "ok",
      output: result.output,
      latencyMs
    });
  }
  return { data: result.output, usage };
};
|
|
562
|
+
|
|
563
|
+
// src/llm/RetryingRunner.ts
|
|
564
|
+
/**
 * Executes a structured LLM call and validates the response against the JSON
 * schema, feeding validation errors back as a follow-up user message and
 * retrying up to `options.maxAttempts` times (default 3).
 *
 * Intermediate attempts use the lenient validator unless `options.strict` is
 * `true`; the final attempt is always validated strictly.
 *
 * @param {object} options - Call configuration: `model`, `schema`, `system`,
 *   `user`, plus optional `schemaName`, `maxAttempts`, `strict`, `execute`,
 *   `events`, `debug`, `callId`, `telemetry` and `parentSpan`.
 * @returns {Promise<{data: unknown, usage: {inputTokens: number, outputTokens: number, totalTokens: number}}>}
 *   The validated data and the token usage summed over all attempts.
 * @throws The last validation error once attempts are exhausted, or any
 *   non-validation error raised by the executor or the validators.
 */
var runWithRetries = async (options) => {
  const { telemetry, parentSpan } = options;
  const retrySpan = telemetry?.startSpan({
    name: "struktur.validation_retry",
    kind: "CHAIN",
    parentSpan,
    attributes: {
      "retry.max_attempts": options.maxAttempts ?? 3,
      "retry.schema_name": options.schemaName ?? "extract"
    }
  });
  const ajv = createAjv();
  const maxAttempts = options.maxAttempts ?? 3;
  const messages = [{ role: "user", content: options.user }];
  const debug = options.debug;
  const callId = options.callId ?? `call_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  let usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let lastError;
  let lastDurationMs = 0;
  let failureReported = false;
  const systemLength = options.system.length;
  const userLength = typeof options.user === "string" ? options.user.length : JSON.stringify(options.user).length;

  // Emits the terminal failure exactly once so the retry span is always
  // closed, whichever path leads out of this function.
  const reportFailure = (error, durationMs) => {
    if (failureReported) {
      return;
    }
    failureReported = true;
    debug?.llmCallComplete({
      callId,
      success: false,
      inputTokens: usage.inputTokens,
      outputTokens: usage.outputTokens,
      totalTokens: usage.totalTokens,
      durationMs,
      error: error?.message ?? String(error)
    });
    if (retrySpan && telemetry) {
      telemetry.endSpan(retrySpan, {
        status: "error",
        error,
        latencyMs: durationMs
      });
    }
  };

  // Emits the success notifications shared by the strict and lenient paths.
  const reportSuccess = (attempt, rawData, output, durationMs) => {
    debug?.validationSuccess({ callId, attempt });
    debug?.llmCallComplete({
      callId,
      success: true,
      inputTokens: usage.inputTokens,
      outputTokens: usage.outputTokens,
      totalTokens: usage.totalTokens,
      durationMs
    });
    if (retrySpan && telemetry) {
      telemetry.recordEvent(retrySpan, {
        type: "validation",
        attempt,
        maxAttempts,
        schema: options.schema,
        input: rawData,
        success: true,
        latencyMs: durationMs
      });
      telemetry.endSpan(retrySpan, {
        status: "ok",
        output,
        latencyMs: durationMs
      });
    }
  };

  debug?.llmCallStart({
    callId,
    model: JSON.stringify(options.model),
    schemaName: options.schemaName,
    systemLength,
    userLength,
    artifactCount: Array.isArray(options.user) ? options.user.length : 0
  });
  debug?.promptSystem({ callId, system: options.system });
  debug?.promptUser({ callId, user: options.user });

  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    const executor = options.execute ?? generateStructured;
    const isFinalAttempt = attempt === maxAttempts;
    const useStrictValidation = options.strict === true || isFinalAttempt;
    debug?.validationStart({
      callId,
      attempt,
      maxAttempts,
      strict: useStrictValidation
    });
    const startTime = Date.now();
    let result;
    try {
      result = await executor({
        model: options.model,
        schema: options.schema,
        schemaName: options.schemaName,
        system: options.system,
        user: options.user,
        messages,
        strict: options.strict,
        telemetry,
        parentSpan: retrySpan
      });
    } catch (error) {
      // Executor failures are not retried; close the span before propagating.
      reportFailure(error, Date.now() - startTime);
      throw error;
    }
    const durationMs = Date.now() - startTime;
    lastDurationMs = durationMs;
    usage = {
      inputTokens: usage.inputTokens + result.usage.inputTokens,
      outputTokens: usage.outputTokens + result.usage.outputTokens,
      totalTokens: usage.totalTokens + result.usage.totalTokens
    };
    debug?.rawResponse({ callId, response: result.data });
    try {
      if (useStrictValidation) {
        const validated = validateOrThrow(
          ajv,
          options.schema,
          result.data
        );
        reportSuccess(attempt, result.data, validated, durationMs);
        return { data: validated, usage };
      }
      const validationResult = validateAllowingMissingRequired(
        ajv,
        options.schema,
        result.data,
        isFinalAttempt
      );
      if (validationResult.valid) {
        reportSuccess(attempt, result.data, validationResult.data, durationMs);
        return { data: validationResult.data, usage };
      }
      throw new SchemaValidationError(
        "Schema validation failed",
        validationResult.errors
      );
    } catch (error) {
      lastError = error;
      if (error instanceof SchemaValidationError) {
        debug?.validationFailed({
          callId,
          attempt,
          errors: error.errors
        });
        if (retrySpan && telemetry) {
          telemetry.recordEvent(retrySpan, {
            type: "validation",
            attempt,
            maxAttempts,
            schema: options.schema,
            input: result.data,
            success: false,
            errors: error.errors,
            latencyMs: durationMs
          });
        }
        const nextAttempt = attempt + 1;
        if (nextAttempt <= maxAttempts) {
          await options.events?.onRetry?.({
            attempt: nextAttempt,
            maxAttempts,
            reason: "schema_validation_failed"
          });
          debug?.retry({
            callId,
            attempt: nextAttempt,
            maxAttempts,
            reason: "schema_validation_failed"
          });
        }
        // The validation errors are appended as a user message so the next
        // attempt can correct them.
        const errorPayload = JSON.stringify(error.errors, null, 2);
        const errorMessage = `<validation-errors>
${errorPayload}
</validation-errors>`;
        messages.push({ role: "user", content: errorMessage });
        await options.events?.onMessage?.({
          role: "user",
          content: errorMessage
        });
        continue;
      }
      reportFailure(error, durationMs);
      break;
    }
  }

  // Reached when attempts are exhausted (or maxAttempts < 1). reportFailure is
  // a no-op if the non-validation branch above already reported.
  const finalError = lastError ?? new Error("Unknown extraction error");
  reportFailure(finalError, lastDurationMs);
  throw finalError;
};
|
|
762
|
+
|
|
763
|
+
// src/strategies/utils.ts
|
|
764
|
+
/**
 * Serializes a JSON schema to its compact JSON text for embedding in prompts.
 *
 * @param {unknown} schema - Schema value to serialize.
 * @returns {string} Result of `JSON.stringify(schema)`.
 */
var serializeSchema = (schema) => JSON.stringify(schema);
|
|
767
|
+
/**
 * Sums a list of token-usage records field by field.
 *
 * @param {Array<{inputTokens: number, outputTokens: number, totalTokens: number}>} usages
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 *   A new record holding the totals; all zeros for an empty list.
 */
var mergeUsage = (usages) => {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;
  usages.forEach((entry) => {
    inputTokens += entry.inputTokens;
    outputTokens += entry.outputTokens;
    totalTokens += entry.totalTokens;
  });
  return { inputTokens, outputTokens, totalTokens };
};
|
|
777
|
+
/**
 * Splits artifacts into batches via `batchArtifacts` and, when telemetry is
 * supplied, records one "chunk" event per batch on a `struktur.chunking` span.
 *
 * @param {Array<object>} artifacts - Artifacts to batch.
 * @param {{maxTokens?: number, maxImages?: number}} options - Batching limits
 *   forwarded to `batchArtifacts`.
 * @param {object} [debug] - Debug logger forwarded to `batchArtifacts`.
 * @param {object} [telemetry] - Telemetry adapter; no span is created without it.
 * @param {object} [parentSpan] - Span the chunking span is attached to.
 * @returns {Array<Array<object>>} The batches produced by `batchArtifacts`.
 * @throws Whatever `batchArtifacts` throws; the span is closed with an error
 *   status first so it does not stay open.
 */
var getBatches = (artifacts, options, debug, telemetry, parentSpan) => {
  const chunkingSpan = telemetry?.startSpan({
    name: "struktur.chunking",
    kind: "RETRIEVER",
    parentSpan,
    attributes: {
      "chunking.artifact_count": artifacts.length,
      "chunking.max_tokens": options.maxTokens,
      "chunking.max_images": options.maxImages
    }
  });
  let batches;
  try {
    batches = batchArtifacts(artifacts, { ...options, debug });
  } catch (error) {
    if (chunkingSpan && telemetry) {
      telemetry.endSpan(chunkingSpan, {
        status: "error",
        error: error instanceof Error ? error : new Error(String(error))
      });
    }
    throw error;
  }
  if (chunkingSpan && telemetry) {
    batches.forEach((batch, index) => {
      telemetry.recordEvent(chunkingSpan, {
        type: "chunk",
        chunkIndex: index,
        totalChunks: batches.length,
        tokens: batch.reduce((sum, a) => sum + (a.tokens || 0), 0),
        // Counts every media entry across all contents of each artifact.
        images: batch.reduce((sum, a) => sum + (a.contents?.flatMap((c) => c.media || []).length || 0), 0)
      });
    });
    telemetry.endSpan(chunkingSpan, {
      status: "ok",
      output: { batchCount: batches.length }
    });
  }
  return batches;
};
|
|
806
|
+
/**
 * Builds the user content from the prompt text and artifacts, then delegates
 * to `runWithRetries` for the validated LLM call.
 *
 * @param {object} options - `model`, `schema`, `system`, `user`, `artifacts`
 *   and the optional `events`, `execute`, `strict`, `debug`, `callId`,
 *   `telemetry` and `parentSpan` pass-through settings.
 * @returns {Promise<{data: unknown, usage: object}>} Result of `runWithRetries`.
 */
var extractWithPrompt = async (options) => {
  const {
    model,
    schema,
    system,
    events,
    execute,
    strict,
    debug,
    callId,
    telemetry,
    parentSpan
  } = options;
  const user = buildUserContent(options.user, options.artifacts);
  return await runWithRetries({
    model,
    schema,
    system,
    user,
    events,
    execute,
    strict,
    debug,
    callId,
    telemetry,
    parentSpan
  });
};
|
|
823
|
+
|
|
824
|
+
// src/strategies/SimpleStrategy.ts
|
|
825
|
+
/**
 * Extraction strategy that sends all artifacts to the model in a single call.
 */
var SimpleStrategy = class {
  name = "simple";
  config;
  /**
   * @param {object} config - Strategy settings; `model`, `execute`, `strict`
   *   and `outputInstructions` are read by `run`.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @returns {number} The step total reported to progress listeners.
   */
  getEstimatedSteps() {
    return 3;
  }
  /**
   * Runs the single extraction call.
   *
   * @param {object} options - `artifacts`, `schema` and the optional `events`,
   *   `debug`, `telemetry` and `strict` settings.
   * @returns {Promise<{data: unknown, usage: object}>} Extracted data and usage.
   * @throws Any error raised while building the prompt or extracting; the
   *   strategy span is closed with an error status before it propagates.
   */
  async run(options) {
    const debug = options.debug;
    const telemetry = options.telemetry ?? void 0;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.simple",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length
      }
    });
    let result;
    try {
      const schema = serializeSchema(options.schema);
      const { system, user } = buildExtractorPrompt(
        options.artifacts,
        schema,
        this.config.outputInstructions
      );
      await options.events?.onStep?.({
        step: 1,
        total: this.getEstimatedSteps(),
        label: "extract"
      });
      debug?.step({
        step: 1,
        total: this.getEstimatedSteps(),
        label: "extract",
        strategy: this.name
      });
      result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system,
        user,
        artifacts: options.artifacts,
        events: options.events,
        execute: this.config.execute,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: "simple_extract",
        telemetry,
        parentSpan: strategySpan
      });
      debug?.step({
        step: 2,
        total: this.getEstimatedSteps(),
        label: "complete",
        strategy: this.name
      });
    } catch (error) {
      // Without this the span would stay open whenever extraction fails.
      if (telemetry && strategySpan) {
        telemetry.endSpan(strategySpan, {
          status: "error",
          error: error instanceof Error ? error : new Error(String(error))
        });
      }
      throw error;
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: result.data
    });
    return { data: result.data, usage: result.usage };
  }
};
|
|
889
|
+
/**
 * Factory for the single-call extraction strategy.
 *
 * @param {object} config - Settings handed to the `SimpleStrategy` constructor.
 * @returns {SimpleStrategy} A new strategy instance.
 */
var simple = (config) => new SimpleStrategy(config);
|
|
892
|
+
|
|
893
|
+
// src/prompts/ParallelMergerPrompt.ts
|
|
894
|
+
/**
 * Builds the system and user prompts that ask the model to merge several
 * partial extraction results into one object matching the schema.
 *
 * @param {string} schema - Serialized JSON schema embedded in the user prompt.
 * @param {Array<unknown>} dataList - Partial results; `null` and `undefined`
 *   entries are left out.
 * @returns {{system: string, user: string}} The prompt pair.
 */
var buildParallelMergerPrompt = (schema, dataList) => {
  const wrappedObjects = [];
  for (const entry of dataList) {
    if (entry === null || entry === void 0) {
      continue;
    }
    wrappedObjects.push(`<json-object>${JSON.stringify(entry)}</json-object>`);
  }
  const jsonObjects = wrappedObjects.join("\n");
  const system = [
    "You are a data merger. Combine multiple JSON objects into one object matching the provided schema.",
    "",
    "<thinking>",
    "Before merging, consider:",
    "1. Which input objects contain data for each schema field?",
    "2. How should conflicting values be resolved (prefer more complete/recent data)?",
    "3. Are there arrays that need to be concatenated vs deduplicated?",
    "4. Ensure NO information is lost from any input",
    "</thinking>",
    "",
    "<rules>",
    "- Produce a single JSON object following the schema exactly",
    "- Combine all information from input objects without losing data",
    "- Resolve conflicts intelligently (prefer richer/more specific data)",
    "- Output ONLY valid JSON - no markdown, no explanations",
    "</rules>"
  ].join("\n");
  const user = [
    "<json-schema>",
    `${schema}`,
    "</json-schema>",
    "",
    "<json-objects>",
    jsonObjects,
    "</json-objects>"
  ].join("\n");
  return { system, user };
};
|
|
921
|
+
|
|
922
|
+
// src/strategies/concurrency.ts
|
|
923
|
+
/**
 * Runs async task factories in consecutive chunks of at most `concurrency`
 * tasks, waiting for each chunk to settle before starting the next.
 *
 * @param {Array<() => Promise<unknown>>} tasks - Zero-argument task factories.
 * @param {number} concurrency - Maximum tasks started together. Values below 1
 *   are treated as 1; a missing or non-numeric value runs every task at once.
 *   Previously 0 or a negative value never terminated and NaN dropped all tasks.
 * @returns {Promise<Array<unknown>>} Task results in the order of `tasks`.
 * @throws The first rejection from any task (fail-fast via `Promise.all`).
 */
var runConcurrently = async (tasks, concurrency) => {
  const requested = concurrency == null ? Number.NaN : Math.floor(Number(concurrency));
  const chunkSize = Number.isNaN(requested) ? Math.max(tasks.length, 1) : Math.max(requested, 1);
  const results = [];
  for (let start = 0; start < tasks.length; start += chunkSize) {
    const running = tasks.slice(start, start + chunkSize).map((task) => task());
    const settled = await Promise.all(running);
    results.push(...settled);
  }
  return results;
};
|
|
932
|
+
|
|
933
|
+
// src/strategies/ParallelStrategy.ts
|
|
934
|
+
/**
 * Extraction strategy that processes artifact batches concurrently and then
 * asks a merge model to combine the per-batch results.
 */
var ParallelStrategy = class {
  name = "parallel";
  config;
  /**
   * @param {object} config - Strategy settings; `model`, `mergeModel`,
   *   `chunkSize`, `maxImages`, `concurrency`, `execute`, `strict` and
   *   `outputInstructions` are read by this class.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @param {Array<object>} artifacts - Artifacts that would be processed.
   * @returns {number} Number of batches plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  /**
   * Extracts every batch (bounded by `config.concurrency`, defaulting to all
   * batches at once) and merges the results with one further model call.
   *
   * @param {object} options - `artifacts`, `schema` and the optional `events`,
   *   `debug`, `telemetry` and `strict` settings.
   * @returns {Promise<{data: unknown, usage: object}>} Merged data and the
   *   usage summed over all batch calls and the merge call.
   * @throws Any error from batching, extraction or merging; open spans are
   *   closed with an error status before it propagates.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    let mergeSpan;
    let mergeSpanOpen = false;
    let results;
    let merged;
    try {
      const batches = getBatches(
        options.artifacts,
        {
          maxTokens: this.config.chunkSize,
          maxImages: this.config.maxImages
        },
        debug,
        telemetry ?? void 0,
        strategySpan
      );
      const schema = serializeSchema(options.schema);
      const totalSteps = this.getEstimatedSteps(options.artifacts);
      let step = 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract"
      });
      debug?.step({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
        strategy: this.name
      });
      const tasks = batches.map((batch, index) => async () => {
        const prompt = buildExtractorPrompt(
          batch,
          schema,
          this.config.outputInstructions
        );
        const result = await extractWithPrompt({
          model: this.config.model,
          schema: options.schema,
          system: prompt.system,
          user: prompt.user,
          artifacts: batch,
          events: options.events,
          execute: this.config.execute,
          strict: options.strict ?? this.config.strict,
          debug,
          callId: `parallel_batch_${index + 1}`,
          telemetry: telemetry ?? void 0,
          parentSpan: strategySpan
        });
        // Progress is announced for the batch after the one that just
        // finished; the last batch hands over to the merge step instead.
        const completedIndex = index + 1;
        if (completedIndex < batches.length) {
          step += 1;
          await options.events?.onStep?.({
            step,
            total: totalSteps,
            label: `batch ${completedIndex + 1}/${batches.length}`
          });
          debug?.step({
            step,
            total: totalSteps,
            label: `batch ${completedIndex + 1}/${batches.length}`,
            strategy: this.name
          });
        }
        return result;
      });
      results = await runConcurrently(
        tasks,
        this.config.concurrency ?? batches.length
      );
      debug?.mergeStart({
        mergeId: "parallel_merge",
        inputCount: results.length,
        strategy: this.name
      });
      mergeSpan = telemetry?.startSpan({
        name: "struktur.merge",
        kind: "CHAIN",
        parentSpan: strategySpan,
        attributes: {
          "merge.strategy": "parallel",
          "merge.input_count": results.length
        }
      });
      mergeSpanOpen = Boolean(mergeSpan);
      const mergePrompt = buildParallelMergerPrompt(
        schema,
        results.map((r) => r.data)
      );
      // NOTE(review): the merge call reads only config.strict, while batch
      // calls honour options.strict first — confirm this is intentional.
      merged = await extractWithPrompt({
        model: this.config.mergeModel,
        schema: options.schema,
        system: mergePrompt.system,
        user: mergePrompt.user,
        artifacts: [],
        events: options.events,
        execute: this.config.execute,
        strict: this.config.strict,
        debug,
        callId: "parallel_merge",
        telemetry: telemetry ?? void 0,
        parentSpan: mergeSpan
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: "merge"
      });
      debug?.step({
        step,
        total: totalSteps,
        label: "merge",
        strategy: this.name
      });
      debug?.mergeComplete({ mergeId: "parallel_merge", success: true });
      if (mergeSpan && telemetry) {
        telemetry.recordEvent(mergeSpan, {
          type: "merge",
          strategy: "parallel",
          inputCount: results.length,
          outputCount: 1
        });
        mergeSpanOpen = false;
        telemetry.endSpan(mergeSpan, {
          status: "ok",
          output: merged.data
        });
      }
    } catch (error) {
      // Close whatever is still open so a failure does not leave spans dangling.
      const failure = error instanceof Error ? error : new Error(String(error));
      if (telemetry && mergeSpan && mergeSpanOpen) {
        telemetry.endSpan(mergeSpan, { status: "error", error: failure });
      }
      if (telemetry && strategySpan) {
        telemetry.endSpan(strategySpan, { status: "error", error: failure });
      }
      throw error;
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: merged.data
    });
    return {
      data: merged.data,
      usage: mergeUsage([...results.map((r) => r.usage), merged.usage])
    };
  }
};
|
|
1092
|
+
/**
 * Factory for the concurrent batch-and-merge extraction strategy.
 *
 * @param {object} config - Settings handed to the `ParallelStrategy` constructor.
 * @returns {ParallelStrategy} A new strategy instance.
 */
var parallel = (config) => new ParallelStrategy(config);
|
|
1095
|
+
|
|
1096
|
+
// src/prompts/SequentialExtractorPrompt.ts
|
|
1097
|
+
/**
 * Builds the system prompt for sequential extraction, where each call must
 * enrich the data produced by the previous one.
 *
 * @param {string} schema - Serialized JSON schema embedded in the prompt.
 * @param {string} [outputInstructions] - Extra instructions; a fixed
 *   placeholder sentence is used when this is null or undefined.
 * @returns {string} The complete system prompt.
 */
var sequentialSystemPrompt = (schema, outputInstructions) => {
  const instructions = outputInstructions ?? "No additional output instructions provided.";
  const lines = [
    "<instructions>",
    "You are a precise data extraction engine. Extract data from provided artifacts according to the JSON schema, enriching any previous data you receive.",
    "",
    "<thinking>",
    "Before extracting, consider:",
    "1. Review previous data - what needs to be preserved vs enriched?",
    "2. Which new fields have clear values in the artifacts?",
    "3. Which fields remain missing or unclear (keep null from previous or set to null)?",
    "4. Can new information improve the structure of existing data?",
    "5. Ensure NO information is lost from previous data",
    "</thinking>",
    "",
    "<rules>",
    "- Merge new artifacts into existing data - do not create fresh objects",
    "- Preserve ALL previous data - losing information breaks the processing chain",
    "- Use null for missing/uncertain values in new fields",
    "- Only extract information explicitly present in the artifacts",
    "- Output ONLY valid JSON matching the schema",
    "- No markdown, explanations, or code fences",
    "</rules>",
    "",
    "<image-handling>",
    "Some schema properties may reference artifact IDs (e.g., 'xxx_artifact_id' fields).",
    "When assigning images to properties:",
    "- Use format: artifact:ID/images/imageNUM.EXT (e.g., 'artifact:123456/images/image1.jpg')",
    "- Only reference images you can actually see in the provided documents/images",
    "- Image references are visible in artifact XML or written on images",
    "- NEVER make up artifact IDs or use normal URLs",
    "</image-handling>",
    "",
    "<output-instructions>",
    `${instructions}`,
    "</output-instructions>",
    "",
    "<json-schema>",
    `${schema}`,
    "</json-schema>",
    "",
    "<how-to-output>",
    "Return the complete extracted data as valid JSON matching the schema.",
    "Include all information from previous data, enriched with the new artifacts.",
    "</how-to-output>",
    "</instructions>"
  ];
  return lines.join("\n");
};
|
|
1142
|
+
/**
 * Builds the user prompt for one sequential extraction step: the artifacts,
 * the data gathered so far, and the merge task description.
 *
 * @param {string} artifactsXml - Artifacts rendered as XML text.
 * @param {string} previousData - Serialized data from earlier steps.
 * @param {string} [outputInstructions] - Extra instructions; empty when null
 *   or undefined.
 * @returns {string} The complete user prompt.
 */
var sequentialUserPrompt = (artifactsXml, previousData, outputInstructions) => {
  const instructions = outputInstructions ?? "";
  const lines = [
    `${artifactsXml}`,
    "",
    "<previous-data>",
    `${previousData}`,
    "</previous-data>",
    "",
    "<task>",
    "Extract the contents of the given artifacts and ADD/MERGE them into the previous data contained in the <previous-data> tag.",
    "You MUST NOT lose any information from the previous data. All previous data must be included in your response.",
    "</task>",
    "",
    "<output-instructions>",
    `${instructions}`,
    "</output-instructions>"
  ];
  return lines.join("\n");
};
|
|
1158
|
+
/**
 * Assembles the system/user prompt pair for one sequential extraction step.
 *
 * @param {Array<object>} artifacts - Artifacts rendered through `formatArtifactsXml`.
 * @param {string} schema - Serialized JSON schema.
 * @param {string} previousData - Serialized data from earlier steps.
 * @param {string} [outputInstructions] - Extra instructions for both prompts.
 * @returns {{system: string, user: string}} The prompt pair.
 */
var buildSequentialPrompt = (artifacts, schema, previousData, outputInstructions) => {
  const artifactsXml = formatArtifactsXml(artifacts);
  const system = sequentialSystemPrompt(schema, outputInstructions);
  const user = sequentialUserPrompt(artifactsXml, previousData, outputInstructions);
  return { system, user };
};
|
|
1165
|
+
|
|
1166
|
+
// src/strategies/SequentialStrategy.ts
|
|
1167
|
+
/**
 * Extraction strategy that processes artifact batches one after another,
 * passing the data accumulated so far into each subsequent call.
 */
var SequentialStrategy = class {
  name = "sequential";
  config;
  /**
   * @param {object} config - Strategy settings; `model`, `chunkSize`,
   *   `maxImages`, `execute`, `strict` and `outputInstructions` are read by
   *   this class.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @param {Array<object>} artifacts - Artifacts that would be processed.
   * @returns {number} Number of batches plus two.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 2;
  }
  /**
   * Extracts each batch in order, feeding the previous result into the next
   * prompt.
   *
   * @param {object} options - `artifacts`, `schema` and the optional `events`,
   *   `debug`, `telemetry` and `strict` settings.
   * @returns {Promise<{data: unknown, usage: object}>} Final data and the usage
   *   summed over all batch calls.
   * @throws {Error} "No data extracted from sequential strategy" when no batch
   *   produced truthy data, or any error from batching/extraction. The
   *   strategy span is closed with an error status before the error propagates.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.sequential",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize
      }
    });
    let currentData;
    const usages = [];
    try {
      const batches = getBatches(
        options.artifacts,
        {
          maxTokens: this.config.chunkSize,
          maxImages: this.config.maxImages
        },
        debug,
        telemetry ?? void 0,
        strategySpan
      );
      const schema = serializeSchema(options.schema);
      const totalSteps = this.getEstimatedSteps(options.artifacts);
      let step = 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract"
      });
      debug?.step({
        step,
        total: totalSteps,
        label: batches.length > 1 ? `batch 1/${batches.length}` : "extract",
        strategy: this.name
      });
      for (const [index, batch] of batches.entries()) {
        // The first batch starts from an empty object.
        const previousData = currentData ? JSON.stringify(currentData) : "{}";
        const prompt = buildSequentialPrompt(
          batch,
          schema,
          previousData,
          this.config.outputInstructions
        );
        const result = await extractWithPrompt({
          model: this.config.model,
          schema: options.schema,
          system: prompt.system,
          user: prompt.user,
          artifacts: batch,
          events: options.events,
          execute: this.config.execute,
          strict: options.strict ?? this.config.strict,
          debug,
          callId: `sequential_batch_${index + 1}`,
          telemetry: telemetry ?? void 0,
          parentSpan: strategySpan
        });
        currentData = result.data;
        usages.push(result.usage);
        step += 1;
        if (index < batches.length - 1) {
          await options.events?.onStep?.({
            step,
            total: totalSteps,
            label: `batch ${index + 2}/${batches.length}`
          });
          debug?.step({
            step,
            total: totalSteps,
            label: `batch ${index + 2}/${batches.length}`,
            strategy: this.name
          });
        }
      }
      if (!currentData) {
        throw new Error("No data extracted from sequential strategy");
      }
    } catch (error) {
      // Without this the span would stay open whenever a batch fails or no
      // data is produced.
      if (telemetry && strategySpan) {
        telemetry.endSpan(strategySpan, {
          status: "error",
          error: error instanceof Error ? error : new Error(String(error))
        });
      }
      throw error;
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: currentData
    });
    return { data: currentData, usage: mergeUsage(usages) };
  }
};
|
|
1267
|
+
/**
 * Factory for the batch-by-batch extraction strategy.
 *
 * @param {object} config - Settings handed to the `SequentialStrategy` constructor.
 * @returns {SequentialStrategy} A new strategy instance.
 */
var sequential = (config) => new SequentialStrategy(config);
|
|
1270
|
+
|
|
1271
|
+
// src/prompts/DeduplicationPrompt.ts
|
|
1272
|
+
/**
 * Builds the system and user prompts that ask the model to list the paths of
 * duplicate entries in already-extracted data.
 *
 * @param {string} schema - Serialized JSON schema embedded in the user prompt.
 * @param {unknown} data - Extracted data, embedded as JSON.
 * @param {string[]} [exampleKeys] - Example paths shown to the model. These
 *   are now reflected in the prompt (they used to be ignored in favour of
 *   hard-coded text); an empty or non-array value falls back to the defaults,
 *   for which the output is unchanged.
 * @returns {{system: string, user: string}} The prompt pair.
 */
var buildDeduplicationPrompt = (schema, data, exampleKeys = ["items.3", "items.5"]) => {
  const keys = Array.isArray(exampleKeys) && exampleKeys.length > 0 ? exampleKeys.map((key) => String(key)) : ["items.3", "items.5"];
  const quotedKeys = keys.map((key) => JSON.stringify(key)).join(", ");
  // Describe the example as "<root> at indices i and j" when every key has the
  // form "<root>.<index>" with a shared root; otherwise quote the raw paths.
  const parsed = keys.map((key) => /^(.+)\.(\d+)$/.exec(key));
  const sharesRoot = parsed.every((match) => match !== null && match[1] === parsed[0][1]);
  let exampleSubject = `the entries at ${quotedKeys}`;
  if (sharesRoot) {
    const indices = parsed.map((match) => match[2]);
    const indexList = indices.length > 1 ? `${indices.slice(0, -1).join(", ")} and ${indices[indices.length - 1]}` : indices[0];
    exampleSubject = `${parsed[0][1]} at ${indices.length > 1 ? "indices" : "index"} ${indexList}`;
  }
  const system = `You are a deduplication engine. Identify duplicate entries in structured data.

<thinking>
Before deduplicating, consider:
1. Which fields indicate uniqueness for each entity type?
2. Are entries duplicates if they share key fields but differ in minor details?
3. Which entry should be kept (prefer more complete data)?
</thinking>

<rules>
- Identify entries that represent the same entity
- Return paths to duplicates using dot notation (e.g., ${quotedKeys})
- Output ONLY JSON in format: { "keys": ["path1", "path2"] }
- No markdown, no explanations
</rules>`;
  const user = `<json-schema>
${schema}
</json-schema>

<json-data>
${JSON.stringify(data)}
</json-data>

<task>Identify duplicate entries in the data and return their paths in the format: { "keys": ["path1", "path2"] }</task>

<example>
If ${exampleSubject} are duplicates, return: { "keys": [${quotedKeys}] }
</example>`;
  return { system, user };
};
|
|
1303
|
+
|
|
1304
|
+
// src/merge/SmartDataMerger.ts
|
|
1305
|
+
/**
 * Tells whether a schema node declares the JSON-schema type "array".
 *
 * @param {{type?: unknown}} schema - Schema node to inspect.
 * @returns {boolean} `true` only when `schema.type` is exactly "array".
 */
var isArraySchema = (schema) => schema.type === "array";
|
|
1311
|
+
/**
 * Tells whether a schema node declares type "object" and carries a
 * `properties` value whose `typeof` is "object" (which includes `null`).
 *
 * @param {{type?: unknown, properties?: unknown}} schema - Schema node to inspect.
 * @returns {boolean} Whether both conditions hold.
 */
var isObjectSchema = (schema) => {
  if (schema.type !== "object") {
    return false;
  }
  return typeof schema.properties === "object";
};
|
|
1314
|
+
/**
 * Merges two extraction results property by property, guided by the
 * top-level `properties` of a JSON schema.
 */
var SmartDataMerger = class {
  schema;
  /**
   * @param {{properties?: object}} schema - Schema whose `properties` drive the merge.
   */
  constructor(schema) {
    this.schema = schema;
  }
  /**
   * Combines `newData` into a shallow copy of `currentData`.
   *
   * For each schema property: array-typed properties are concatenated
   * (current first), object-typed properties are shallow-merged (new wins),
   * and any other property takes the new value unless it is undefined, null
   * or an empty string.
   *
   * @param {object} currentData - Data accumulated so far.
   * @param {object} newData - Freshly extracted data.
   * @returns {object} A new merged object; neither input is mutated.
   */
  merge(currentData, newData) {
    const result = { ...currentData };
    const schemaProperties = this.schema.properties ?? {};
    Object.keys(schemaProperties).forEach((field) => {
      const fieldSchema = schemaProperties[field];
      const existing = currentData[field];
      const incoming = newData[field];
      if (isArraySchema(fieldSchema)) {
        const existingItems = Array.isArray(existing) ? existing : [];
        const incomingItems = Array.isArray(incoming) ? incoming : [];
        result[field] = [...existingItems, ...incomingItems];
      } else if (isObjectSchema(fieldSchema)) {
        const existingObject = typeof existing === "object" && existing ? existing : {};
        const incomingObject = typeof incoming === "object" && incoming ? incoming : {};
        result[field] = { ...existingObject, ...incomingObject };
      } else {
        const hasIncoming = incoming != null && incoming !== "";
        if (hasIncoming) {
          result[field] = incoming;
        } else if (existing !== void 0) {
          result[field] = existing;
        }
      }
    });
    return result;
  }
};
|
|
1348
|
+
|
|
1349
|
+
// src/merge/Deduplicator.ts
|
|
1350
|
+
/**
 * Computes a 32-bit FNV-1a style hash over the UTF-16 code units of a string.
 *
 * @param {string} str - Text to hash.
 * @returns {number} Unsigned 32-bit hash value.
 */
var fnv1a32 = (str) => {
  const FNV_OFFSET_BASIS = 2166136261;
  const FNV_PRIME = 16777619;
  let state = FNV_OFFSET_BASIS;
  let position = 0;
  while (position < str.length) {
    // XOR the code unit in, then multiply with 32-bit wraparound.
    state = Math.imul(state ^ str.charCodeAt(position), FNV_PRIME);
    position += 1;
  }
  // Reinterpret the signed 32-bit state as unsigned.
  return state >>> 0;
};
|
|
1358
|
+
/**
 * Serializes a value to JSON-like text with object keys sorted, so that two
 * objects with the same entries in a different order produce the same string.
 *
 * Keys are JSON-escaped; previously a key containing a double quote could
 * make two different objects serialize to identical text.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string | undefined} The serialized text; `undefined` for top-level
 *   values that `JSON.stringify` itself cannot represent.
 */
var stableStringify = (value) => {
  if (value === null || typeof value !== "object") {
    return JSON.stringify(value);
  }
  if (Array.isArray(value)) {
    return `[${value.map((item) => stableStringify(item)).join(",")}]`;
  }
  const entries = Object.entries(value)
    .sort(([a], [b]) => a.localeCompare(b))
    .map(([key, val]) => `${JSON.stringify(key)}:${stableStringify(val)}`);
  return `{${entries.join(",")}}`;
};
|
|
1368
|
+
/**
 * Finds the indices of items that exactly repeat an earlier item.
 *
 * Items are bucketed by the 32-bit hash of their stable serialization, and
 * the serializations themselves are compared inside a bucket, so a hash
 * collision between different items is no longer reported as a duplicate.
 *
 * @param {Array<unknown>} items - Items to scan.
 * @returns {number[]} Indices of every repeat occurrence, in ascending order;
 *   the first occurrence of each item is never included.
 */
var findExactDuplicatesWithHashing = (items) => {
  const buckets = /* @__PURE__ */ new Map();
  const duplicates = [];
  items.forEach((item, index) => {
    // String() keeps items with no JSON form (e.g. undefined) hashable
    // instead of crashing on a missing `.length`.
    const serialized = String(stableStringify(item));
    const hash = fnv1a32(serialized);
    const bucket = buckets.get(hash);
    if (bucket === void 0) {
      buckets.set(hash, [serialized]);
      return;
    }
    if (bucket.includes(serialized)) {
      duplicates.push(index);
      return;
    }
    bucket.push(serialized);
  });
  return duplicates;
};
|
|
1381
|
+
/**
 * Returns a copy of `items` without the elements at the given positions.
 *
 * @param {Array<*>} items - Source list (not modified).
 * @param {Iterable<number>} indices - Positions to leave out.
 * @returns {Array<*>} New array holding the remaining elements in order.
 */
var deduplicateByIndices = (items, indices) => {
  const skipped = new Set(indices);
  const kept = [];
  items.forEach((item, position) => {
    if (!skipped.has(position)) {
      kept.push(item);
    }
  });
  return kept;
};
|
|
1385
|
+
|
|
1386
|
+
// src/strategies/ParallelAutoMergeStrategy.ts
|
|
1387
|
+
// JSON Schema for the de-duplication response: an object whose single required
// property, `keys`, is an array of strings; no other properties are allowed.
var dedupeSchema = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" }
    }
  },
  required: ["keys"],
  additionalProperties: false
};
|
|
1395
|
+
/**
 * Returns a shallow copy of `data` in which every top-level array property has
 * had its exact duplicate entries removed. Non-array properties are copied as-is.
 *
 * @param {Object} data - Record to clean (not modified).
 * @returns {Object} Copy of `data` with de-duplicated top-level arrays.
 */
var dedupeArrays = (data) => {
  const output = { ...data };
  Object.keys(output).forEach((field) => {
    const items = output[field];
    if (!Array.isArray(items)) {
      return;
    }
    output[field] = deduplicateByIndices(items, findExactDuplicatesWithHashing(items));
  });
  return output;
};
|
|
1405
|
+
/**
 * Removes one element from a top-level array property, addressed by a
 * "<field>.<index>" path, without mutating the input.
 *
 * @param {Object} data - Record containing the array.
 * @param {string} path - Field name and zero-based index joined by a dot.
 * @returns {Object} A copy of `data` with the element removed, or `data` itself
 *   when the path is malformed or the field is not an array.
 */
var removeByPath = (data, path) => {
  const [root, indexStr] = path.split(".");
  // Accept only a plain non-negative decimal index. Number() alone would turn ""
  // into 0 and let "-1" reach splice(), silently deleting the first or last
  // element for a malformed path.
  if (!root || indexStr === void 0 || !/^\d+$/.test(indexStr)) {
    return data;
  }
  const value = data[root];
  if (!Array.isArray(value)) {
    return data;
  }
  const next = [...value];
  // An index beyond the end makes splice a no-op.
  next.splice(Number(indexStr), 1);
  return { ...data, [root]: next };
};
|
|
1419
|
+
/**
 * Strategy that extracts from all artifact batches concurrently, folds the
 * per-batch results together with SmartDataMerger, and then de-duplicates the
 * merged data twice: first by exact match, then via a model call that returns
 * "<field>.<index>" paths of the remaining duplicates.
 */
var ParallelAutoMergeStrategy = class {
  name = "parallel-auto-merge";
  config;
  /**
   * @param {Object} config - Strategy settings read by this class: model,
   *   dedupeModel, execute, dedupeExecute, strict, chunkSize, maxImages,
   *   concurrency and outputInstructions.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Estimates the number of progress steps for a run.
   *
   * @param {Array} artifacts - Artifacts that would be processed.
   * @returns {number} Number of batches plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  /**
   * Runs the strategy.
   *
   * @param {Object} options - Run inputs: artifacts, schema, and optionally
   *   strict, events, debug and telemetry.
   * @returns {Promise<{data: Object, usage: *}>} De-duplicated data and the
   *   combined usage of every model call.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.parallel-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    // Shared progress counter; tasks bump it as they finish, so step numbers
    // follow completion order rather than batch order.
    let step = 1;
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `parallel_auto_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: strategySpan
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
      return result;
    });
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    const merger = new SmartDataMerger(
      options.schema
    );
    let merged = {};
    debug?.mergeStart({
      mergeId: "parallel_auto_smart_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": results.length
      }
    });
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      merged = merger.merge(merged, result.data);
      // Report, per field of this result, the array sizes after the merge
      // (left) and in the incoming result (right); undefined for non-arrays.
      for (const key of Object.keys(result.data)) {
        const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
        const rightArray = Array.isArray(
          result.data[key]
        ) ? result.data[key].length : void 0;
        debug?.smartMergeField({
          mergeId: "parallel_auto_smart_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray
        });
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1
          });
        }
      }
    }
    debug?.mergeComplete({
      mergeId: "parallel_auto_smart_merge",
      success: true
    });
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged
      });
    }
    merged = dedupeArrays(merged);
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing"
      }
    });
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged
      });
    }
    const dedupePrompt = buildDeduplicationPrompt(schema, merged);
    debug?.dedupeStart({
      dedupeId: "parallel_auto_dedupe",
      itemCount: Object.keys(merged).length
    });
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm"
      }
    });
    const dedupeResponse = await runWithRetries({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "parallel_auto_dedupe",
      telemetry: telemetry ?? void 0,
      parentSpan: llmDedupeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name
    });
    // The returned paths index into `merged` as it was given to the model.
    // Removing them one at a time in arrival order would shift every later
    // position after each removal and delete the wrong items, and a repeated
    // path would delete an extra one. Collect the distinct indices per field and
    // remove from the highest index down so earlier positions stay valid.
    const removalsByField = /* @__PURE__ */ new Map();
    for (const path of dedupeResponse.data.keys) {
      const [field, indexStr] = path.split(".");
      if (!field || indexStr === void 0 || !/^\d+$/.test(indexStr)) {
        continue;
      }
      const indices = removalsByField.get(field) ?? /* @__PURE__ */ new Set();
      indices.add(Number(indexStr));
      removalsByField.set(field, indices);
    }
    let deduped = merged;
    for (const [field, indices] of removalsByField) {
      for (const index of [...indices].sort((a, b) => b - a)) {
        deduped = removeByPath(deduped, `${field}.${index}`);
      }
    }
    debug?.dedupeComplete({
      dedupeId: "parallel_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length
    });
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped
      });
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: deduped
    });
    return {
      data: deduped,
      usage: mergeUsage([...results.map((r) => r.usage), dedupeResponse.usage])
    };
  }
};
|
|
1642
|
+
/**
 * Factory for the parallel auto-merge strategy.
 *
 * @param {Object} config - Settings handed to the strategy constructor.
 * @returns {ParallelAutoMergeStrategy} New strategy instance.
 */
var parallelAutoMerge = (config) => new ParallelAutoMergeStrategy(config);
|
|
1645
|
+
|
|
1646
|
+
// src/strategies/SequentialAutoMergeStrategy.ts
|
|
1647
|
+
// JSON Schema for the de-duplication response: an object whose single required
// property, `keys`, is an array of strings; no other properties are allowed.
var dedupeSchema2 = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" }
    }
  },
  required: ["keys"],
  additionalProperties: false
};
|
|
1655
|
+
/**
 * Returns a shallow copy of `data` in which every top-level array property has
 * had its exact duplicate entries removed. Non-array properties are copied as-is.
 *
 * @param {Object} data - Record to clean (not modified).
 * @returns {Object} Copy of `data` with de-duplicated top-level arrays.
 */
var dedupeArrays2 = (data) => {
  const output = { ...data };
  Object.keys(output).forEach((field) => {
    const items = output[field];
    if (!Array.isArray(items)) {
      return;
    }
    output[field] = deduplicateByIndices(items, findExactDuplicatesWithHashing(items));
  });
  return output;
};
|
|
1665
|
+
/**
 * Removes one element from a top-level array property, addressed by a
 * "<field>.<index>" path, without mutating the input.
 *
 * @param {Object} data - Record containing the array.
 * @param {string} path - Field name and zero-based index joined by a dot.
 * @returns {Object} A copy of `data` with the element removed, or `data` itself
 *   when the path is malformed or the field is not an array.
 */
var removeByPath2 = (data, path) => {
  const [root, indexStr] = path.split(".");
  // Accept only a plain non-negative decimal index. Number() alone would turn ""
  // into 0 and let "-1" reach splice(), silently deleting the first or last
  // element for a malformed path.
  if (!root || indexStr === void 0 || !/^\d+$/.test(indexStr)) {
    return data;
  }
  const value = data[root];
  if (!Array.isArray(value)) {
    return data;
  }
  const next = [...value];
  // An index beyond the end makes splice a no-op.
  next.splice(Number(indexStr), 1);
  return { ...data, [root]: next };
};
|
|
1679
|
+
/**
 * Strategy that extracts from artifact batches one after another, folding each
 * result into the running data with SmartDataMerger, and then de-duplicates the
 * merged data twice: first by exact match, then via a model call that returns
 * "<field>.<index>" paths of the remaining duplicates.
 */
var SequentialAutoMergeStrategy = class {
  name = "sequential-auto-merge";
  config;
  /**
   * @param {Object} config - Strategy settings read by this class: model,
   *   dedupeModel, execute, dedupeExecute, strict, chunkSize, maxImages and
   *   outputInstructions.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Estimates the number of progress steps for a run.
   *
   * @param {Array} artifacts - Artifacts that would be processed.
   * @returns {number} Number of batches plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length + 3;
  }
  /**
   * Runs the strategy.
   *
   * @param {Object} options - Run inputs: artifacts, schema, and optionally
   *   strict, events, debug and telemetry.
   * @returns {Promise<{data: Object, usage: *}>} De-duplicated data and the
   *   combined usage of every model call.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.sequential-auto-merge",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const merger = new SmartDataMerger(
      options.schema
    );
    let merged = {};
    const usages = [];
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;
    debug?.mergeStart({
      mergeId: "sequential_auto_merge",
      inputCount: batches.length,
      strategy: this.name
    });
    // The merge span stays open across the whole batch loop; each extraction
    // call is parented to it.
    const mergeSpan = telemetry?.startSpan({
      name: "struktur.smart_merge",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "merge.strategy": "smart",
        "merge.input_count": batches.length
      }
    });
    for (const [index, batch] of batches.entries()) {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict: options.strict ?? this.config.strict,
        debug,
        callId: `sequential_auto_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: mergeSpan
      });
      merged = merger.merge(merged, result.data);
      usages.push(result.usage);
      // Report, per field of this result, the array sizes after the merge
      // (left) and in the incoming result (right); undefined for non-arrays.
      for (const key of Object.keys(result.data)) {
        const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
        const rightArray = Array.isArray(
          result.data[key]
        ) ? result.data[key].length : void 0;
        debug?.smartMergeField({
          mergeId: "sequential_auto_merge",
          field: key,
          operation: "merge_arrays",
          leftCount: leftArray,
          rightCount: rightArray
        });
        if (mergeSpan && telemetry) {
          telemetry.recordEvent(mergeSpan, {
            type: "merge",
            strategy: "smart",
            inputCount: rightArray ?? 1,
            outputCount: leftArray ?? 1
          });
        }
      }
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
    }
    debug?.mergeComplete({ mergeId: "sequential_auto_merge", success: true });
    if (mergeSpan && telemetry) {
      telemetry.endSpan(mergeSpan, {
        status: "ok",
        output: merged
      });
    }
    merged = dedupeArrays2(merged);
    const exactDedupeSpan = telemetry?.startSpan({
      name: "struktur.exact_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "exact_hashing"
      }
    });
    if (exactDedupeSpan && telemetry) {
      telemetry.recordEvent(exactDedupeSpan, {
        type: "merge",
        strategy: "exact_hash_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(merged).length
      });
      telemetry.endSpan(exactDedupeSpan, {
        status: "ok",
        output: merged
      });
    }
    const dedupePrompt = buildDeduplicationPrompt(schema, merged);
    debug?.dedupeStart({
      dedupeId: "sequential_auto_dedupe",
      itemCount: Object.keys(merged).length
    });
    const llmDedupeSpan = telemetry?.startSpan({
      name: "struktur.llm_dedupe",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "dedupe.method": "llm"
      }
    });
    const dedupeResponse = await runWithRetries({
      model: this.config.dedupeModel ?? this.config.model,
      schema: dedupeSchema2,
      system: dedupePrompt.system,
      user: dedupePrompt.user,
      events: options.events,
      execute: this.config.dedupeExecute,
      strict: this.config.strict,
      debug,
      callId: "sequential_auto_dedupe",
      telemetry: telemetry ?? void 0,
      parentSpan: llmDedupeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "dedupe"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "dedupe",
      strategy: this.name
    });
    // The returned paths index into `merged` as it was given to the model.
    // Removing them one at a time in arrival order would shift every later
    // position after each removal and delete the wrong items, and a repeated
    // path would delete an extra one. Collect the distinct indices per field and
    // remove from the highest index down so earlier positions stay valid.
    const removalsByField = /* @__PURE__ */ new Map();
    for (const path of dedupeResponse.data.keys) {
      const [field, indexStr] = path.split(".");
      if (!field || indexStr === void 0 || !/^\d+$/.test(indexStr)) {
        continue;
      }
      const indices = removalsByField.get(field) ?? /* @__PURE__ */ new Set();
      indices.add(Number(indexStr));
      removalsByField.set(field, indices);
    }
    let deduped = merged;
    for (const [field, indices] of removalsByField) {
      for (const index of [...indices].sort((a, b) => b - a)) {
        deduped = removeByPath2(deduped, `${field}.${index}`);
      }
    }
    debug?.dedupeComplete({
      dedupeId: "sequential_auto_dedupe",
      duplicatesFound: dedupeResponse.data.keys.length,
      itemsRemoved: dedupeResponse.data.keys.length
    });
    if (llmDedupeSpan && telemetry) {
      telemetry.recordEvent(llmDedupeSpan, {
        type: "merge",
        strategy: "llm_dedupe",
        inputCount: Object.keys(merged).length,
        outputCount: Object.keys(deduped).length,
        deduped: dedupeResponse.data.keys.length
      });
      telemetry.endSpan(llmDedupeSpan, {
        status: "ok",
        output: deduped
      });
    }
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: deduped
    });
    return {
      data: deduped,
      usage: mergeUsage([...usages, dedupeResponse.usage])
    };
  }
};
|
|
1890
|
+
/**
 * Factory for the sequential auto-merge strategy.
 *
 * @param {Object} config - Settings handed to the strategy constructor.
 * @returns {SequentialAutoMergeStrategy} New strategy instance.
 */
var sequentialAutoMerge = (config) => new SequentialAutoMergeStrategy(config);
|
|
1893
|
+
|
|
1894
|
+
// src/strategies/DoublePassStrategy.ts
|
|
1895
|
+
/**
 * Two-pass strategy. Pass 1 extracts from all artifact batches concurrently and
 * merges the per-batch results with one model call. Pass 2 walks the batches
 * again in order, each call receiving the current data as JSON and replacing it
 * with its own result.
 */
var DoublePassStrategy = class {
  name = "double-pass";
  config;
  /**
   * @param {Object} config - Strategy settings read by this class: model,
   *   mergeModel, execute, strict, chunkSize, maxImages, concurrency and
   *   outputInstructions.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Estimates the number of progress steps for a run.
   *
   * @param {Array} artifacts - Artifacts that would be processed.
   * @returns {number} Twice the number of batches plus three.
   */
  getEstimatedSteps(artifacts) {
    const batches = getBatches(artifacts, {
      maxTokens: this.config.chunkSize,
      maxImages: this.config.maxImages
    });
    return batches.length * 2 + 3;
  }
  /**
   * Runs both passes.
   *
   * @param {Object} options - Run inputs: artifacts, schema, and optionally
   *   strict, events, debug and telemetry.
   * @returns {Promise<{data: Object, usage: *}>} Data from the last pass-2 call
   *   (or the pass-1 merge when there are no batches) and the combined usage.
   */
  async run(options) {
    const debug = options.debug;
    const { telemetry } = options;
    // Resolve the strictness once so a per-call override applies to every
    // extraction against the caller's schema, not only to the pass-1 batches.
    const strict = options.strict ?? this.config.strict;
    const strategySpan = telemetry?.startSpan({
      name: "strategy.double-pass",
      kind: "CHAIN",
      attributes: {
        "strategy.name": this.name,
        "strategy.artifacts.count": options.artifacts.length,
        "strategy.chunk_size": this.config.chunkSize,
        "strategy.concurrency": this.config.concurrency
      }
    });
    const batches = getBatches(
      options.artifacts,
      {
        maxTokens: this.config.chunkSize,
        maxImages: this.config.maxImages
      },
      debug,
      telemetry ?? void 0,
      strategySpan
    );
    const schema = serializeSchema(options.schema);
    const totalSteps = this.getEstimatedSteps(options.artifacts);
    let step = 1;
    const pass1Span = telemetry?.startSpan({
      name: "struktur.pass_1",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 1,
        "pass.type": "parallel_extraction"
      }
    });
    const tasks = batches.map((batch, index) => async () => {
      const prompt = buildExtractorPrompt(
        batch,
        schema,
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict,
        debug,
        callId: `double_pass_1_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass1Span
      });
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 1 batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
      return result;
    });
    const results = await runConcurrently(
      tasks,
      this.config.concurrency ?? batches.length
    );
    debug?.mergeStart({
      mergeId: "double_pass_1_merge",
      inputCount: results.length,
      strategy: this.name
    });
    const pass1MergeSpan = telemetry?.startSpan({
      name: "struktur.pass_1_merge",
      kind: "CHAIN",
      parentSpan: pass1Span,
      attributes: {
        "merge.strategy": "parallel",
        "merge.input_count": results.length
      }
    });
    const mergePrompt = buildParallelMergerPrompt(
      schema,
      results.map((r) => r.data)
    );
    // The merge call carries no artifacts; it works from the pass-1 results
    // embedded in the prompt.
    const merged = await extractWithPrompt({
      model: this.config.mergeModel,
      schema: options.schema,
      system: mergePrompt.system,
      user: mergePrompt.user,
      artifacts: [],
      events: options.events,
      execute: this.config.execute,
      strict,
      debug,
      callId: "double_pass_1_merge",
      telemetry: telemetry ?? void 0,
      parentSpan: pass1MergeSpan
    });
    step += 1;
    await options.events?.onStep?.({
      step,
      total: totalSteps,
      label: "pass 1 merge"
    });
    debug?.step({
      step,
      total: totalSteps,
      label: "pass 1 merge",
      strategy: this.name
    });
    debug?.mergeComplete({ mergeId: "double_pass_1_merge", success: true });
    if (pass1MergeSpan && telemetry) {
      telemetry.recordEvent(pass1MergeSpan, {
        type: "merge",
        strategy: "parallel",
        inputCount: results.length,
        outputCount: 1
      });
      telemetry.endSpan(pass1MergeSpan, {
        status: "ok",
        output: merged.data
      });
    }
    telemetry?.endSpan(pass1Span, {
      status: "ok",
      output: merged.data
    });
    const pass2Span = telemetry?.startSpan({
      name: "struktur.pass_2",
      kind: "CHAIN",
      parentSpan: strategySpan,
      attributes: {
        "pass.number": 2,
        "pass.type": "sequential_refinement"
      }
    });
    let currentData = merged.data;
    const usages = [...results.map((r) => r.usage), merged.usage];
    for (const [index, batch] of batches.entries()) {
      const prompt = buildSequentialPrompt(
        batch,
        schema,
        JSON.stringify(currentData),
        this.config.outputInstructions
      );
      const result = await extractWithPrompt({
        model: this.config.model,
        schema: options.schema,
        system: prompt.system,
        user: prompt.user,
        artifacts: batch,
        events: options.events,
        execute: this.config.execute,
        strict,
        debug,
        callId: `double_pass_2_batch_${index + 1}`,
        telemetry: telemetry ?? void 0,
        parentSpan: pass2Span
      });
      // Each pass-2 result replaces the running data wholesale.
      currentData = result.data;
      usages.push(result.usage);
      step += 1;
      await options.events?.onStep?.({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`
      });
      debug?.step({
        step,
        total: totalSteps,
        label: `pass 2 batch ${index + 1}/${batches.length}`,
        strategy: this.name
      });
    }
    telemetry?.endSpan(pass2Span, {
      status: "ok",
      output: currentData
    });
    telemetry?.endSpan(strategySpan, {
      status: "ok",
      output: currentData
    });
    return { data: currentData, usage: mergeUsage(usages) };
  }
};
|
|
2100
|
+
/**
 * Factory for the double-pass strategy.
 *
 * @param {Object} config - Settings handed to the strategy constructor.
 * @returns {DoublePassStrategy} New strategy instance.
 */
var doublePass = (config) => new DoublePassStrategy(config);
|
|
2103
|
+
|
|
2104
|
+
// src/strategies/DoublePassAutoMergeStrategy.ts
|
|
2105
|
+
// JSON Schema for the de-duplication response: an object whose single required
// property, `keys`, is an array of strings; no other properties are allowed.
var dedupeSchema3 = {
  type: "object",
  properties: {
    keys: {
      type: "array",
      items: { type: "string" }
    }
  },
  required: ["keys"],
  additionalProperties: false
};
|
|
2113
|
+
/**
 * Returns a shallow copy of `data` in which every top-level array property has
 * had its exact duplicate entries removed. Non-array properties are copied as-is.
 *
 * @param {Object} data - Record to clean (not modified).
 * @returns {Object} Copy of `data` with de-duplicated top-level arrays.
 */
var dedupeArrays3 = (data) => {
  const output = { ...data };
  Object.keys(output).forEach((field) => {
    const items = output[field];
    if (!Array.isArray(items)) {
      return;
    }
    output[field] = deduplicateByIndices(items, findExactDuplicatesWithHashing(items));
  });
  return output;
};
|
|
2123
|
+
/**
 * Removes one element from a top-level array property, addressed by a
 * "<field>.<index>" path, without mutating the input.
 *
 * @param {Object} data - Record containing the array.
 * @param {string} path - Field name and zero-based index joined by a dot.
 * @returns {Object} A copy of `data` with the element removed, or `data` itself
 *   when the path is malformed or the field is not an array.
 */
var removeByPath3 = (data, path) => {
  const [root, indexStr] = path.split(".");
  // Accept only a plain non-negative decimal index. Number() alone would turn ""
  // into 0 and let "-1" reach splice(), silently deleting the first or last
  // element for a malformed path.
  if (!root || indexStr === void 0 || !/^\d+$/.test(indexStr)) {
    return data;
  }
  const value = data[root];
  if (!Array.isArray(value)) {
    return data;
  }
  const next = [...value];
  // An index beyond the end makes splice a no-op.
  next.splice(Number(indexStr), 1);
  return { ...data, [root]: next };
};
|
|
2137
|
+
var DoublePassAutoMergeStrategy = class {
|
|
2138
|
+
name = "double-pass-auto-merge";
|
|
2139
|
+
config;
|
|
2140
|
+
constructor(config) {
|
|
2141
|
+
this.config = config;
|
|
2142
|
+
}
|
|
2143
|
+
getEstimatedSteps(artifacts) {
|
|
2144
|
+
const batches = getBatches(artifacts, {
|
|
2145
|
+
maxTokens: this.config.chunkSize,
|
|
2146
|
+
maxImages: this.config.maxImages
|
|
2147
|
+
});
|
|
2148
|
+
return batches.length * 2 + 3;
|
|
2149
|
+
}
|
|
2150
|
+
async run(options) {
|
|
2151
|
+
const debug = options.debug;
|
|
2152
|
+
const { telemetry } = options;
|
|
2153
|
+
const strategySpan = telemetry?.startSpan({
|
|
2154
|
+
name: "strategy.double-pass-auto-merge",
|
|
2155
|
+
kind: "CHAIN",
|
|
2156
|
+
attributes: {
|
|
2157
|
+
"strategy.name": this.name,
|
|
2158
|
+
"strategy.artifacts.count": options.artifacts.length,
|
|
2159
|
+
"strategy.chunk_size": this.config.chunkSize,
|
|
2160
|
+
"strategy.concurrency": this.config.concurrency
|
|
2161
|
+
}
|
|
2162
|
+
});
|
|
2163
|
+
const batches = getBatches(
|
|
2164
|
+
options.artifacts,
|
|
2165
|
+
{
|
|
2166
|
+
maxTokens: this.config.chunkSize,
|
|
2167
|
+
maxImages: this.config.maxImages
|
|
2168
|
+
},
|
|
2169
|
+
debug,
|
|
2170
|
+
telemetry ?? void 0,
|
|
2171
|
+
strategySpan
|
|
2172
|
+
);
|
|
2173
|
+
const schema = serializeSchema(options.schema);
|
|
2174
|
+
const totalSteps = this.getEstimatedSteps(options.artifacts);
|
|
2175
|
+
let step = 1;
|
|
2176
|
+
const pass1Span = telemetry?.startSpan({
|
|
2177
|
+
name: "struktur.pass_1",
|
|
2178
|
+
kind: "CHAIN",
|
|
2179
|
+
parentSpan: strategySpan,
|
|
2180
|
+
attributes: {
|
|
2181
|
+
"pass.number": 1,
|
|
2182
|
+
"pass.type": "parallel_extraction"
|
|
2183
|
+
}
|
|
2184
|
+
});
|
|
2185
|
+
const tasks = batches.map((batch, index) => async () => {
|
|
2186
|
+
const prompt = buildExtractorPrompt(
|
|
2187
|
+
batch,
|
|
2188
|
+
schema,
|
|
2189
|
+
this.config.outputInstructions
|
|
2190
|
+
);
|
|
2191
|
+
const result = await extractWithPrompt({
|
|
2192
|
+
model: this.config.model,
|
|
2193
|
+
schema: options.schema,
|
|
2194
|
+
system: prompt.system,
|
|
2195
|
+
user: prompt.user,
|
|
2196
|
+
artifacts: batch,
|
|
2197
|
+
events: options.events,
|
|
2198
|
+
execute: this.config.execute,
|
|
2199
|
+
strict: options.strict ?? this.config.strict,
|
|
2200
|
+
debug,
|
|
2201
|
+
callId: `double_pass_auto_1_batch_${index + 1}`,
|
|
2202
|
+
telemetry: telemetry ?? void 0,
|
|
2203
|
+
parentSpan: pass1Span
|
|
2204
|
+
});
|
|
2205
|
+
step += 1;
|
|
2206
|
+
await options.events?.onStep?.({
|
|
2207
|
+
step,
|
|
2208
|
+
total: totalSteps,
|
|
2209
|
+
label: `pass 1 batch ${index + 1}/${batches.length}`
|
|
2210
|
+
});
|
|
2211
|
+
debug?.step({
|
|
2212
|
+
step,
|
|
2213
|
+
total: totalSteps,
|
|
2214
|
+
label: `pass 1 batch ${index + 1}/${batches.length}`,
|
|
2215
|
+
strategy: this.name
|
|
2216
|
+
});
|
|
2217
|
+
return result;
|
|
2218
|
+
});
|
|
2219
|
+
const results = await runConcurrently(
|
|
2220
|
+
tasks,
|
|
2221
|
+
this.config.concurrency ?? batches.length
|
|
2222
|
+
);
|
|
2223
|
+
const merger = new SmartDataMerger(
|
|
2224
|
+
options.schema
|
|
2225
|
+
);
|
|
2226
|
+
let merged = {};
|
|
2227
|
+
debug?.mergeStart({
|
|
2228
|
+
mergeId: "double_pass_auto_merge",
|
|
2229
|
+
inputCount: results.length,
|
|
2230
|
+
strategy: this.name
|
|
2231
|
+
});
|
|
2232
|
+
const mergeSpan = telemetry?.startSpan({
|
|
2233
|
+
name: "struktur.smart_merge",
|
|
2234
|
+
kind: "CHAIN",
|
|
2235
|
+
parentSpan: pass1Span,
|
|
2236
|
+
attributes: {
|
|
2237
|
+
"merge.strategy": "smart",
|
|
2238
|
+
"merge.input_count": results.length
|
|
2239
|
+
}
|
|
2240
|
+
});
|
|
2241
|
+
for (let i = 0; i < results.length; i++) {
|
|
2242
|
+
const result = results[i];
|
|
2243
|
+
merged = merger.merge(merged, result.data);
|
|
2244
|
+
for (const key of Object.keys(result.data)) {
|
|
2245
|
+
const leftArray = Array.isArray(merged[key]) ? merged[key].length : void 0;
|
|
2246
|
+
const rightArray = Array.isArray(
|
|
2247
|
+
result.data[key]
|
|
2248
|
+
) ? result.data[key].length : void 0;
|
|
2249
|
+
debug?.smartMergeField({
|
|
2250
|
+
mergeId: "double_pass_auto_merge",
|
|
2251
|
+
field: key,
|
|
2252
|
+
operation: "merge_arrays",
|
|
2253
|
+
leftCount: leftArray,
|
|
2254
|
+
rightCount: rightArray
|
|
2255
|
+
});
|
|
2256
|
+
if (mergeSpan && telemetry) {
|
|
2257
|
+
telemetry.recordEvent(mergeSpan, {
|
|
2258
|
+
type: "merge",
|
|
2259
|
+
strategy: "smart",
|
|
2260
|
+
inputCount: rightArray ?? 1,
|
|
2261
|
+
outputCount: leftArray ?? 1
|
|
2262
|
+
});
|
|
2263
|
+
}
|
|
2264
|
+
}
|
|
2265
|
+
}
|
|
2266
|
+
debug?.mergeComplete({ mergeId: "double_pass_auto_merge", success: true });
|
|
2267
|
+
if (mergeSpan && telemetry) {
|
|
2268
|
+
telemetry.endSpan(mergeSpan, {
|
|
2269
|
+
status: "ok",
|
|
2270
|
+
output: merged
|
|
2271
|
+
});
|
|
2272
|
+
}
|
|
2273
|
+
merged = dedupeArrays3(merged);
|
|
2274
|
+
const exactDedupeSpan = telemetry?.startSpan({
|
|
2275
|
+
name: "struktur.exact_dedupe",
|
|
2276
|
+
kind: "CHAIN",
|
|
2277
|
+
parentSpan: pass1Span,
|
|
2278
|
+
attributes: {
|
|
2279
|
+
"dedupe.method": "exact_hashing"
|
|
2280
|
+
}
|
|
2281
|
+
});
|
|
2282
|
+
if (exactDedupeSpan && telemetry) {
|
|
2283
|
+
telemetry.recordEvent(exactDedupeSpan, {
|
|
2284
|
+
type: "merge",
|
|
2285
|
+
strategy: "exact_hash_dedupe",
|
|
2286
|
+
inputCount: Object.keys(merged).length,
|
|
2287
|
+
outputCount: Object.keys(merged).length
|
|
2288
|
+
});
|
|
2289
|
+
telemetry.endSpan(exactDedupeSpan, {
|
|
2290
|
+
status: "ok",
|
|
2291
|
+
output: merged
|
|
2292
|
+
});
|
|
2293
|
+
}
|
|
2294
|
+
const dedupePrompt = buildDeduplicationPrompt(schema, merged);
|
|
2295
|
+
debug?.dedupeStart({
|
|
2296
|
+
dedupeId: "double_pass_auto_dedupe",
|
|
2297
|
+
itemCount: Object.keys(merged).length
|
|
2298
|
+
});
|
|
2299
|
+
const llmDedupeSpan = telemetry?.startSpan({
|
|
2300
|
+
name: "struktur.llm_dedupe",
|
|
2301
|
+
kind: "CHAIN",
|
|
2302
|
+
parentSpan: pass1Span,
|
|
2303
|
+
attributes: {
|
|
2304
|
+
"dedupe.method": "llm"
|
|
2305
|
+
}
|
|
2306
|
+
});
|
|
2307
|
+
const dedupeResponse = await runWithRetries({
|
|
2308
|
+
model: this.config.dedupeModel ?? this.config.model,
|
|
2309
|
+
schema: dedupeSchema3,
|
|
2310
|
+
system: dedupePrompt.system,
|
|
2311
|
+
user: dedupePrompt.user,
|
|
2312
|
+
events: options.events,
|
|
2313
|
+
execute: this.config.dedupeExecute,
|
|
2314
|
+
strict: this.config.strict,
|
|
2315
|
+
debug,
|
|
2316
|
+
callId: "double_pass_auto_dedupe",
|
|
2317
|
+
telemetry: telemetry ?? void 0,
|
|
2318
|
+
parentSpan: llmDedupeSpan
|
|
2319
|
+
});
|
|
2320
|
+
step += 1;
|
|
2321
|
+
await options.events?.onStep?.({
|
|
2322
|
+
step,
|
|
2323
|
+
total: totalSteps,
|
|
2324
|
+
label: "pass 1 dedupe"
|
|
2325
|
+
});
|
|
2326
|
+
debug?.step({
|
|
2327
|
+
step,
|
|
2328
|
+
total: totalSteps,
|
|
2329
|
+
label: "pass 1 dedupe",
|
|
2330
|
+
strategy: this.name
|
|
2331
|
+
});
|
|
2332
|
+
let deduped = merged;
|
|
2333
|
+
for (const key of dedupeResponse.data.keys) {
|
|
2334
|
+
deduped = removeByPath3(deduped, key);
|
|
2335
|
+
}
|
|
2336
|
+
debug?.dedupeComplete({
|
|
2337
|
+
dedupeId: "double_pass_auto_dedupe",
|
|
2338
|
+
duplicatesFound: dedupeResponse.data.keys.length,
|
|
2339
|
+
itemsRemoved: dedupeResponse.data.keys.length
|
|
2340
|
+
});
|
|
2341
|
+
if (llmDedupeSpan && telemetry) {
|
|
2342
|
+
telemetry.recordEvent(llmDedupeSpan, {
|
|
2343
|
+
type: "merge",
|
|
2344
|
+
strategy: "llm_dedupe",
|
|
2345
|
+
inputCount: Object.keys(merged).length,
|
|
2346
|
+
outputCount: Object.keys(deduped).length,
|
|
2347
|
+
deduped: dedupeResponse.data.keys.length
|
|
2348
|
+
});
|
|
2349
|
+
telemetry.endSpan(llmDedupeSpan, {
|
|
2350
|
+
status: "ok",
|
|
2351
|
+
output: deduped
|
|
2352
|
+
});
|
|
2353
|
+
}
|
|
2354
|
+
telemetry?.endSpan(pass1Span, {
|
|
2355
|
+
status: "ok",
|
|
2356
|
+
output: deduped
|
|
2357
|
+
});
|
|
2358
|
+
let currentData = deduped;
|
|
2359
|
+
const usages = [...results.map((r) => r.usage), dedupeResponse.usage];
|
|
2360
|
+
const pass2Span = telemetry?.startSpan({
|
|
2361
|
+
name: "struktur.pass_2",
|
|
2362
|
+
kind: "CHAIN",
|
|
2363
|
+
parentSpan: strategySpan,
|
|
2364
|
+
attributes: {
|
|
2365
|
+
"pass.number": 2,
|
|
2366
|
+
"pass.type": "sequential_refinement"
|
|
2367
|
+
}
|
|
2368
|
+
});
|
|
2369
|
+
for (const [index, batch] of batches.entries()) {
|
|
2370
|
+
const prompt = buildSequentialPrompt(
|
|
2371
|
+
batch,
|
|
2372
|
+
schema,
|
|
2373
|
+
JSON.stringify(currentData),
|
|
2374
|
+
this.config.outputInstructions
|
|
2375
|
+
);
|
|
2376
|
+
const result = await extractWithPrompt({
|
|
2377
|
+
model: this.config.model,
|
|
2378
|
+
schema: options.schema,
|
|
2379
|
+
system: prompt.system,
|
|
2380
|
+
user: prompt.user,
|
|
2381
|
+
artifacts: batch,
|
|
2382
|
+
events: options.events,
|
|
2383
|
+
execute: this.config.execute,
|
|
2384
|
+
strict: this.config.strict,
|
|
2385
|
+
debug,
|
|
2386
|
+
callId: `double_pass_auto_2_batch_${index + 1}`,
|
|
2387
|
+
telemetry: telemetry ?? void 0,
|
|
2388
|
+
parentSpan: pass2Span
|
|
2389
|
+
});
|
|
2390
|
+
currentData = result.data;
|
|
2391
|
+
usages.push(result.usage);
|
|
2392
|
+
step += 1;
|
|
2393
|
+
await options.events?.onStep?.({
|
|
2394
|
+
step,
|
|
2395
|
+
total: totalSteps,
|
|
2396
|
+
label: `pass 2 batch ${index + 1}/${batches.length}`
|
|
2397
|
+
});
|
|
2398
|
+
debug?.step({
|
|
2399
|
+
step,
|
|
2400
|
+
total: totalSteps,
|
|
2401
|
+
label: `pass 2 batch ${index + 1}/${batches.length}`,
|
|
2402
|
+
strategy: this.name
|
|
2403
|
+
});
|
|
2404
|
+
}
|
|
2405
|
+
telemetry?.endSpan(pass2Span, {
|
|
2406
|
+
status: "ok",
|
|
2407
|
+
output: currentData
|
|
2408
|
+
});
|
|
2409
|
+
telemetry?.endSpan(strategySpan, {
|
|
2410
|
+
status: "ok",
|
|
2411
|
+
output: currentData
|
|
2412
|
+
});
|
|
2413
|
+
return { data: currentData, usage: mergeUsage(usages) };
|
|
2414
|
+
}
|
|
2415
|
+
};
|
|
2416
|
+
/**
 * Factory for the double-pass auto-merge extraction strategy.
 *
 * @param {object} config - Passed unchanged to the DoublePassAutoMergeStrategy constructor.
 * @returns {DoublePassAutoMergeStrategy} A newly constructed strategy instance.
 */
var doublePassAutoMerge = (config) => new DoublePassAutoMergeStrategy(config);
|
|
2419
|
+
|
|
2420
|
+
// src/strategies/agent/AgentStrategy.ts
|
|
2421
|
+
import {
|
|
2422
|
+
createAgentSession,
|
|
2423
|
+
AuthStorage,
|
|
2424
|
+
ModelRegistry,
|
|
2425
|
+
SessionManager,
|
|
2426
|
+
SettingsManager,
|
|
2427
|
+
DefaultResourceLoader
|
|
2428
|
+
} from "@mariozechner/pi-coding-agent";
|
|
2429
|
+
import { Bash as Bash2 } from "just-bash";
|
|
2430
|
+
|
|
2431
|
+
// src/strategies/agent/ArtifactFilesystem.ts
|
|
2432
|
+
/**
 * Guess a file extension for base64-encoded image data.
 *
 * Raw base64 is classified by its leading characters (the base64 form of the
 * format's magic bytes). Data URLs ("data:image/<subtype>...") are classified
 * by their declared MIME subtype; when the subtype is not recognised, the
 * payload after the first comma is sniffed instead.
 *
 * @param {string} base64 - Base64 image data, optionally wrapped in a data URL.
 * @returns {"jpg"|"png"|"gif"|"webp"|"bmp"|"svg"|"bin"} The detected extension,
 *   or "bin" when nothing matches.
 */
var detectImageFormat = (base64) => {
  // Checked in order; the first matching prefix wins.
  const signatureExtensions = [
    ["/9j/", "jpg"],
    ["iVBOR", "png"],
    ["R0lGOD", "gif"],
    ["UklGR", "webp"],
    ["Qk", "bmp"],
    ["PHN2Zy", "svg"]
  ];
  const subtypeExtensions = /* @__PURE__ */ new Map([
    ["jpeg", "jpg"],
    ["jpg", "jpg"],
    ["png", "png"],
    ["gif", "gif"],
    ["webp", "webp"],
    ["bmp", "bmp"]
  ]);
  let payload = base64;
  const dataUrl = /^data:image\/([a-z0-9.+-]+)/i.exec(base64);
  if (dataUrl) {
    const subtype = dataUrl[1].toLowerCase();
    // Any "data:image/svg..." prefix counts as SVG (covers "svg+xml").
    if (subtype.startsWith("svg")) {
      return "svg";
    }
    const declared = subtypeExtensions.get(subtype);
    if (declared !== void 0) {
      return declared;
    }
    // Unknown subtype: fall back to sniffing the encoded payload, if any.
    const comma = base64.indexOf(",");
    payload = comma === -1 ? "" : base64.slice(comma + 1);
  }
  const match = signatureExtensions.find(([prefix]) => payload.startsWith(prefix));
  return match ? match[1] : "bin";
};
|
|
2453
|
+
/**
 * Turn an artifact id into a lowercase, dash-separated slug.
 *
 * Every run of characters outside [a-zA-Z0-9] becomes a single "-", a leading
 * or trailing "-" is dropped, and the result is lowercased. An id with no
 * ASCII alphanumerics yields the empty string.
 *
 * @param {string} name - The artifact id to sanitize.
 * @returns {string} The sanitized slug.
 */
var sanitizeArtifactName = (name) => {
  const dashed = name.replace(/[^a-zA-Z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-|-$/g, "");
  return trimmed.toLowerCase();
};
|
|
2456
|
+
/**
 * Build the virtual filesystem handed to the extraction agent.
 *
 * Every media entry carrying non-empty `base64` data is moved into a virtual
 * file under "/images/"; in the transformed artifact it is replaced by a
 * `virtualPath` plus a short `originalBase64` placeholder. Media without
 * base64 data is copied through with the same descriptive fields.
 *
 * Paths follow "/images/{artifact}-page-{n}-image-{i}.{ext}" (the page segment
 * is omitted when the content has no `page`). If that path is already taken —
 * e.g. two page-less content entries in one artifact, or two artifact ids that
 * sanitize to the same name — a numeric "-{k}" suffix is appended so that no
 * image overwrites another.
 *
 * @param {Array<object>} artifacts - Artifacts with `id`, `type`, `metadata`,
 *   `tokens` and `contents` (each content having `page`, `text`, optional `media`).
 * @returns {object} An object with:
 *   - "/artifact.json": pretty-printed JSON of the transformed artifacts,
 *   - "/manifest.json": pretty-printed JSON summary (counts, tokens, virtual paths),
 *   - virtualFiles: Map of virtual path -> base64 string,
 *   - getImageByPath(path): lookup into `virtualFiles` (undefined when absent).
 */
var createVirtualFilesystem = (artifacts) => {
  const virtualFiles = /* @__PURE__ */ new Map();
  // Returns `${stem}.${extension}`, or the first `${stem}-${k}.${extension}` not yet in use.
  const reserveVirtualPath = (stem, extension) => {
    let candidate = `${stem}.${extension}`;
    for (let suffix = 1; virtualFiles.has(candidate); suffix += 1) {
      candidate = `${stem}-${suffix}.${extension}`;
    }
    return candidate;
  };
  const transformedArtifacts = artifacts.map((artifact) => {
    const artifactName = sanitizeArtifactName(artifact.id);
    return {
      id: artifact.id,
      type: artifact.type,
      metadata: artifact.metadata,
      tokens: artifact.tokens,
      contents: artifact.contents.map((content) => {
        const pageNumber = content.page;
        const pageSegment = pageNumber !== void 0 ? `-page-${pageNumber}` : "";
        return {
          page: content.page,
          text: content.text,
          media: content.media?.map((media, mediaIndex) => {
            // Fields copied for every media entry; `base64` itself is never copied.
            const described = {
              type: media.type,
              url: media.url,
              text: media.text,
              x: media.x,
              y: media.y,
              width: media.width,
              height: media.height,
              imageType: media.imageType
            };
            if (!(media.base64 && media.base64.length > 0)) {
              return described;
            }
            const extension = detectImageFormat(media.base64);
            const virtualPath = reserveVirtualPath(
              `/images/${artifactName}${pageSegment}-image-${mediaIndex}`,
              extension
            );
            virtualFiles.set(virtualPath, media.base64);
            return {
              ...described,
              virtualPath,
              // Placeholder only, for debugging; the data lives in `virtualFiles`.
              originalBase64: `[BASE64: ${media.base64.length} chars]`
            };
          })
        };
      })
    };
  });
  const virtualPaths = Array.from(virtualFiles.keys());
  const countByType = (type) => artifacts.filter((a) => a.type === type).length;
  const manifest = {
    count: artifacts.length,
    artifacts: transformedArtifacts,
    totalTokens: transformedArtifacts.reduce((sum, a) => sum + (a.tokens || 0), 0),
    summary: {
      textArtifacts: countByType("text"),
      imageArtifacts: countByType("image"),
      pdfArtifacts: countByType("pdf"),
      fileArtifacts: countByType("file")
    },
    virtualFiles: {
      count: virtualPaths.length,
      paths: virtualPaths
    }
  };
  const getImageByPath = (path) => virtualFiles.get(path);
  return {
    "/artifact.json": JSON.stringify(transformedArtifacts, null, 2),
    "/manifest.json": JSON.stringify(manifest, null, 2),
    virtualFiles,
    getImageByPath
  };
};
|
|
2536
|
+
|
|
2537
|
+
// src/strategies/agent/AgentTools.ts
|
|
2538
|
+
import "just-bash";
|
|
2539
|
+
import { Type } from "@sinclair/typebox";
|
|
2540
|
+
// TypeBox parameter schemas for the agent tools.

/** Parameters of the "bash" tool: a command string and an optional timeout. */
var BashParams = Type.Object({
  command: Type.String({ description: "The bash command to execute" }),
  timeout: Type.Optional(Type.Number({ description: "Timeout in milliseconds (default: 30000)" }))
});

/** Parameters of the "read" tool: a file path plus optional line-based pagination. */
var ReadParams = Type.Object({
  file_path: Type.String({ description: "The absolute path to the file to read" }),
  offset: Type.Optional(Type.Number({ description: "Line number to start reading from (1-indexed, default: 1)" })),
  limit: Type.Optional(Type.Number({ description: "Maximum number of lines to read" }))
});

/** Parameters of the "grep" tool: pattern, target path and optional extra flags. */
var GrepParams = Type.Object({
  pattern: Type.String({ description: "The search pattern" }),
  path: Type.String({ description: "The file or directory to search in" }),
  options: Type.Optional(Type.String({ description: "Additional grep options (e.g., '-r' for recursive, '-i' for case-insensitive)" }))
});

/** Parameters of the "find" tool: a directory and an optional filename pattern. */
var FindParams = Type.Object({
  path: Type.String({ description: "The directory to search in" }),
  name: Type.Optional(Type.String({ description: "Filename pattern to match (e.g., '*.json')" }))
});

/** Parameters of the "ls" tool: a directory and an optional recursive flag. */
var LsParams = Type.Object({
  path: Type.String({ description: "The directory to list" }),
  recursive: Type.Optional(Type.Boolean({ description: "List recursively" }))
});

/** Schema with a single `data` field of any type. */
var SetOutputDataParams = Type.Object({
  data: Type.Any({ description: "The output data to set. Can be any shape - will be validated against the schema." })
});

/** Schema with a single `changes` field: a string-keyed record of arbitrary values. */
var UpdateOutputDataParams = Type.Object({
  changes: Type.Record(Type.String(), Type.Any(), { description: "Changes to merge into the existing output data. Uses deep merge. Missing fields are preserved." })
});
|
|
2608
|
+
/**
 * Parameters of the "view_image" tool.
 *
 * The example path uses the "/images/" prefix because that is where
 * createVirtualFilesystem registers image files; the image lookup is an exact
 * match on that virtual path.
 */
var ViewImageParams = Type.Object({
  image_path: Type.String({
    description: "The absolute path to the image file to view (e.g., '/images/artifact-name-page-1-image-0.png')"
  })
});
|
|
2613
|
+
/** Empty object schema: takes no parameters. */
var FinishParams = Type.Object({});

/** Schema with a single required `reason` string. */
var FailParams = Type.Object({
  reason: Type.String({ description: "Explanation of why extraction failed or what data could not be found." })
});
|
|
2619
|
+
/**
 * Build the exploration tools (bash, read, grep, find, ls, view_image) that
 * operate on the agent's virtual filesystem.
 *
 * Every tool's `execute` resolves to `{ content, details }` and never rejects:
 * failures are reported as `{ content, details: { error }, isError: true }`.
 * Paths, patterns and filename globs are single-quoted before being placed in
 * a shell command, so quotes, `$` and backticks in them are taken literally.
 *
 * @param {{ exec(command: string): Promise<{ stdout: string, stderr: string, exitCode: number }> }} bash
 *   Shell used to run commands against the virtual filesystem.
 * @param {((path: string) => string | undefined) | undefined} getImageByPath
 *   Lookup from virtual image path to base64 data; when absent, view_image
 *   reports that image viewing is unavailable.
 * @returns {{ bash: object, read: object, grep: object, find: object, ls: object, view_image: object }}
 *   Tool definitions keyed by tool name.
 */
var createVirtualFilesystemTools = (bash, getImageByPath) => {
  // Maximum number of base64 characters echoed back by the read tool for images.
  const IMAGE_PREVIEW_CHARS = 1e3;

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (error) => error instanceof Error ? error.message : String(error);

  // POSIX single-quoting: wrap in '...' and spell each embedded ' as '\''.
  const shellQuote = (value) => `'${String(value).replace(/'/g, "'\\''")}'`;

  const countNonEmptyLines = (output) => output.split("\n").filter((line) => line.trim()).length;

  const errorResult = (text, error) => ({
    content: [{ type: "text", text }],
    details: { error },
    isError: true
  });

  // Whole number >= 1, or undefined for anything else (missing, 0, negative, NaN).
  const toPositiveInteger = (value) => {
    const parsed = Number(value);
    return Number.isFinite(parsed) && parsed >= 1 ? Math.floor(parsed) : void 0;
  };

  // MIME type from the path's extension, else from the base64 signature, else PNG.
  const sniffImageMimeType = (path, data) => {
    if (path.endsWith(".png")) return "image/png";
    if (path.endsWith(".jpg") || path.endsWith(".jpeg")) return "image/jpeg";
    if (path.endsWith(".gif")) return "image/gif";
    if (path.endsWith(".webp")) return "image/webp";
    if (path.endsWith(".bmp")) return "image/bmp";
    if (path.endsWith(".svg")) return "image/svg+xml";
    if (data.startsWith("/9j/")) return "image/jpeg";
    if (data.startsWith("iVBORw0KGgo")) return "image/png";
    if (data.startsWith("R0lGOD")) return "image/gif";
    if (data.startsWith("UklGR")) return "image/webp";
    if (data.startsWith("Qk")) return "image/bmp";
    if (data.startsWith("PHN2Zy")) return "image/svg+xml";
    return "image/png";
  };

  const bashTool = {
    name: "bash",
    label: "Bash",
    description: "Execute bash commands in the virtual filesystem. Use this to explore artifacts with commands like cat, grep, head, tail, jq, etc.",
    parameters: BashParams,
    // NOTE(review): params.timeout is accepted by the schema but not applied here.
    execute: async (toolCallId, params, signal, onUpdate, ctx) => {
      try {
        const result = await bash.exec(params.command);
        const text = result.exitCode === 0
          ? result.stdout || "(no output)"
          : `Exit code ${result.exitCode}: ${result.stderr || result.stdout || "(no output)"}`;
        return {
          content: [{ type: "text", text }],
          details: {
            exitCode: result.exitCode,
            ...result.stderr && { stderr: result.stderr }
          }
        };
      } catch (error) {
        const errorMsg = describeError(error);
        console.error(`[AgentTools] Bash command error: ${errorMsg}`);
        console.error(`[AgentTools] Command: ${params.command}`);
        return errorResult(`Error: ${errorMsg}`, errorMsg);
      }
    }
  };

  const readTool = {
    name: "read",
    label: "Read File",
    description: "Read the contents of a file from the virtual filesystem. Supports pagination with offset and limit parameters. Can read virtual image files (binary/base64 content).",
    parameters: ReadParams,
    execute: async (toolCallId, params, signal, onUpdate, ctx) => {
      try {
        // Registered images are answered from the lookup with a truncated preview.
        if (getImageByPath && params.file_path.startsWith("/images/")) {
          const imageData = getImageByPath(params.file_path);
          if (imageData) {
            const truncated = imageData.length > IMAGE_PREVIEW_CHARS;
            const displayData = truncated ? imageData.slice(0, IMAGE_PREVIEW_CHARS) + "... [truncated]" : imageData;
            return {
              content: [
                {
                  type: "text",
                  text: [
                    `[IMAGE FILE: ${params.file_path}]`,
                    `Base64 content (${imageData.length} chars):`,
                    displayData
                  ].join("\n")
                }
              ],
              details: {
                path: params.file_path,
                size: imageData.length,
                truncated
              }
            };
          }
        }
        // Normalised so that only whole numbers ever reach the sed script.
        const offset = toPositiveInteger(params.offset) ?? 1;
        const limit = toPositiveInteger(params.limit);
        const target = shellQuote(params.file_path);
        let command;
        if (limit !== void 0) {
          command = `sed -n '${offset},${offset + limit - 1}p' ${target}`;
        } else if (offset > 1) {
          command = `sed -n '${offset},$p' ${target}`;
        } else {
          command = `cat ${target}`;
        }
        const result = await bash.exec(command);
        if (result.exitCode !== 0) {
          const reason = result.stderr || "File not found";
          return errorResult(`Error reading file: ${reason}`, reason);
        }
        return {
          content: [{ type: "text", text: result.stdout }],
          details: {
            lines: result.stdout.split("\n").length,
            characters: result.stdout.length
          }
        };
      } catch (error) {
        const errorMsg = describeError(error);
        console.error(`[AgentTools] Read error: ${errorMsg}`);
        console.error(`[AgentTools] File path: ${params.file_path}`);
        return errorResult(`Error: ${errorMsg}`, errorMsg);
      }
    }
  };

  const grepTool = {
    name: "grep",
    label: "Grep",
    description: "Search for patterns in files using grep",
    parameters: GrepParams,
    execute: async (toolCallId, params, signal, onUpdate, ctx) => {
      try {
        // `options` is passed through unquoted on purpose: it may carry several
        // flags, some with their own quoting (e.g. --include="*.json").
        const options = (params.options || "").trim();
        const command = ["grep", options, shellQuote(params.pattern), shellQuote(params.path)].filter(Boolean).join(" ");
        const result = await bash.exec(command);
        const stdout = result.stdout || "";
        const matches = countNonEmptyLines(stdout);
        // With no output, an exit status above 1 plus a stderr message is reported
        // as an error instead of "no matches" (assumes standard grep exit codes).
        if (matches === 0 && result.exitCode > 1 && result.stderr) {
          return errorResult(`Error: ${result.stderr}`, result.stderr);
        }
        return {
          content: [{ type: "text", text: matches > 0 ? stdout : "(no matches found)" }],
          details: { matches }
        };
      } catch (error) {
        const errorMsg = describeError(error);
        return errorResult(`Error: ${errorMsg}`, errorMsg);
      }
    }
  };

  const findTool = {
    name: "find",
    label: "Find",
    description: "Find files by name or pattern",
    parameters: FindParams,
    execute: async (toolCallId, params, signal, onUpdate, ctx) => {
      try {
        const nameFilter = params.name ? ` -name ${shellQuote(params.name)}` : "";
        const result = await bash.exec(`find ${shellQuote(params.path)} -type f${nameFilter}`);
        return {
          content: [{ type: "text", text: result.stdout || "(no files found)" }],
          details: { files: countNonEmptyLines(result.stdout) }
        };
      } catch (error) {
        const errorMsg = describeError(error);
        return errorResult(`Error: ${errorMsg}`, errorMsg);
      }
    }
  };

  const lsTool = {
    name: "ls",
    label: "List Directory",
    description: "List files and directories",
    parameters: LsParams,
    execute: async (toolCallId, params, signal, onUpdate, ctx) => {
      try {
        const flags = params.recursive ? "-laR" : "-la";
        const result = await bash.exec(`ls ${flags} ${shellQuote(params.path)}`);
        if (result.exitCode !== 0) {
          const reason = result.stderr || "Directory not found";
          return errorResult(`Error: ${reason}`, reason);
        }
        return {
          content: [{ type: "text", text: result.stdout }],
          details: {
            // The "total N" header line of `ls -l` is not counted as an entry.
            entries: result.stdout.split("\n").filter((line) => line.trim() && !line.startsWith("total ")).length
          }
        };
      } catch (error) {
        const errorMsg = describeError(error);
        return errorResult(`Error: ${errorMsg}`, errorMsg);
      }
    }
  };

  const viewImageTool = {
    name: "view_image",
    label: "View Image",
    description: "View an image file from the virtual filesystem. This injects the image as a visual message that the AI can see and analyze. Use this to examine images, screenshots, diagrams, or any visual content in the artifacts.",
    parameters: ViewImageParams,
    execute: async (toolCallId, params, signal, onUpdate, ctx) => {
      try {
        if (!getImageByPath) {
          return errorResult("Error: Image viewing is not available", "Image viewing is not available");
        }
        const imageData = getImageByPath(params.image_path);
        if (!imageData) {
          const reason = `Image not found at ${params.image_path}`;
          return errorResult(`Error: ${reason}`, reason);
        }
        const mimeType = sniffImageMimeType(params.image_path, imageData);
        return {
          content: [
            { type: "text", text: `[Viewing image: ${params.image_path}]` },
            { type: "image", data: imageData, mimeType }
          ],
          details: {
            path: params.image_path,
            format: mimeType,
            size: imageData.length
          }
        };
      } catch (error) {
        const errorMsg = describeError(error);
        console.error(`[AgentTools] View image error: ${errorMsg}`);
        console.error(`[AgentTools] Image path: ${params.image_path}`);
        return errorResult(`Error viewing image: ${errorMsg}`, errorMsg);
      }
    }
  };

  return {
    bash: bashTool,
    read: readTool,
    grep: grepTool,
    find: findTool,
    ls: lsTool,
    view_image: viewImageTool
  };
};
|
|
2967
|
+
|
|
2968
|
+
// src/strategies/agent/AgentStrategy.ts
|
|
2969
|
+
/**
 * Build the default system prompt for the extraction agent.
 *
 * The prompt describes the virtual filesystem layout, the available tools and
 * their JSON call format, and embeds the target schema. Every tool example
 * uses the JSON parameter form, matching the "Common Mistakes" section that
 * rejects non-JSON calls.
 *
 * @param {string} schema - The JSON schema, already serialized to text; inserted verbatim.
 * @param {string} [outputInstructions] - Optional extra instructions; when truthy,
 *   an "Additional Instructions" section containing them is included.
 * @returns {string} The complete system prompt.
 */
var defaultSystemPrompt = (schema, outputInstructions) => {
  return `You are an autonomous data extraction agent. Your task is to explore the provided artifacts and extract structured data according to the given JSON schema.

## Your Environment

You have access to a virtual filesystem containing the artifacts to extract from:
- "/artifact.json" - All artifacts in a structured JSON format (with embedded images replaced by virtual file paths)
- "/manifest.json" - Summary and metadata about the artifacts
- "/images/" - Virtual directory containing extracted image files (when artifacts have embedded images)

## Virtual Image Files

When artifacts contain embedded images (base64-encoded), they are extracted to separate files in "/images/" for easier access:
- Image files are named: "/images/{artifact-name}-page-{n}-image-{i}.{ext}"
- {artifact-name}: Sanitized artifact ID (lowercase, special chars become dashes)
- page-{n}: Page number from the artifact (if available)
- image-{i}: Image index within that page
- {ext}: File extension determined from base64 (jpg, png, gif, webp, bmp, svg, or bin)
- Examples: "/images/invoice-page-3-image-0.jpg" or "/images/report-image-1.png"
- Use the "/images/" directory to access image data
- The manifest shows which virtual files are available
- Image format is shown in the file extension for easy identification

## IMPORTANT: Do NOT Install Tools

This is a **sandboxed environment** - you CANNOT install packages or tools:
- \u274C DO NOT run: apt-get, pip install, npm install, brew install, etc.
- \u274C DO NOT try to install tesseract, ocrmypdf, poppler, or any OCR tools
- \u274C DO NOT check if tools exist with "which" or "command -v"
- \u2705 ONLY use the provided tools listed below
- \u2705 If a tool is missing, work with what you have or report it via fail()

## Available Tools

### Exploration Tools
- **read** - Read file contents with pagination support (e.g., read {"file_path": "/manifest.json", "limit": 50})
- **view_image** - View an image to see its contents visually (e.g., view_image {"image_path": "/images/doc-page-1-image-0.png"})
- **bash** - Run shell commands (e.g., bash {"command": "head -20 /artifact.json"})
- **grep** - Search for patterns in files
- **find** - Find files by name or pattern
- **ls** - List files and directories

### Output Management Tools (IMPORTANT - Use These!)
- **set_output_data** - Set the initial extraction output. Call this as soon as you find the first piece of data.
- Example: set_output_data({"data": {"company_name": "Acme Corp"}})
- The data can be any shape - you'll update it incrementally

- **update_output_data** - Add or modify fields in the existing output data
- Example: update_output_data({"changes": {"address": "123 Main St"}})
- This merges new data with existing data (deep merge)
- Call this frequently as you discover more information

- **finish** - Call this when extraction is complete and data validates against the schema
- Only works if the data is valid according to the schema
- This ends the extraction successfully

- **fail** - Call this if the schema cannot be satisfied with available data
- Provide a reason explaining what data was missing or why extraction failed

## CRITICAL: Incremental Data Updates

**You MUST update the output data continuously as you explore!**

1. **Start immediately**: As soon as you find the first field, call set_output_data
2. **Update frequently**: Every time you find new information, call update_output_data
3. **Build incrementally**: Don't wait until the end - keep adding data as you go
4. **Use all tools**: Combine exploration tools with output tools

### Example Workflow

1. Read manifest: read {"file_path": "/manifest.json", "limit": 20}
2. Find first data point: grep {"pattern": "company_name", "path": "/artifact.json"}
3. **Set initial data**: set_output_data({"data": {"company_name": "Acme Inc"}})
4. Continue exploring: read {"file_path": "/artifact.json", "offset": 50, "limit": 30}
5. **Update with more data**: update_output_data({"changes": {"address": "123 Main St", "city": "Berlin"}})
6. Check images: view_image {"image_path": "/images/doc-page-1-image-0.png"}
7. **Update again**: update_output_data({"changes": {"has_logo": true}})
8. Verify complete: Check all schema fields are present
9. **Finish**: finish()

## Efficient Exploration Strategy

**Don't read entire files at once.** Files may be large. Instead:

1. **Start small**: Read just the first 20-50 lines to understand the structure
2. **Navigate selectively**: Use offset and limit to jump to relevant sections
3. **Search first**: Use grep to find specific data before reading full content
4. **Iterate**: Make multiple small reads rather than one giant read
5. **Update as you go**: Call update_output_data immediately when you find data

### Pagination Examples

Read first 30 lines:
read {"file_path": "/artifact.json", "limit": 30}

Read lines 31-60 (page 2):
read {"file_path": "/artifact.json", "offset": 31, "limit": 30}

Read from line 100 to end:
read {"file_path": "/artifact.json", "offset": 100}

## Output Rules

- **Update continuously**: Call update_output_data every time you find new information
- **Start early**: Don't wait until the end - set initial data as soon as possible
- **Use null for missing values**: If a field can't be found, set it to null
- **Never guess**: Only extract information explicitly present in the artifacts
- **Validate as you go**: The tools will tell you if your data has validation issues
- **Finish properly**: You MUST call finish() to complete extraction successfully
- **Fail if needed**: Use fail() if the schema truly cannot be satisfied

${outputInstructions ? `
## Additional Instructions

${outputInstructions}
` : ""}

## JSON Schema

${schema}

## CRITICAL: Tool Calling Format

When calling tools, you MUST provide the correct parameters:

**CORRECT - read with file_path:**
read {"file_path": "/manifest.json"}

**CORRECT - read with pagination:**
read {"file_path": "/artifact.json", "offset": 1, "limit": 50}

**CORRECT - view image:**
view_image {"image_path": "/images/doc-page-1-image-0.png"}

**CORRECT - set output data:**
set_output_data {"data": {"company_name": "Acme Corp"}}

**CORRECT - update output:**
update_output_data {"changes": {"address": "123 Main St"}}

**CORRECT - finish:**
finish {}

**CORRECT - fail:**
fail {"reason": "Document is not an invoice"}

## Common Mistakes to AVOID

\u274C WRONG: read {} (missing file_path)
\u274C WRONG: read {file_path: "/path"} (missing quotes around property names)
\u274C WRONG: read /path (not using JSON format)
\u274C WRONG: set_output_data {company: "Name"} (missing quotes and data wrapper)
\u274C WRONG: Trying to install tools with apt-get, pip, npm, etc. (not allowed in sandbox)

## Remember

1. **ALWAYS** use set_output_data or update_output_data when you find information
2. **ALWAYS** call finish() when done (or fail() if impossible)
3. **ALWAYS** provide required parameters when calling tools (file_path for read, data for set_output_data, etc.)
4. **NEVER** try to install packages or external tools - work with what you have
5. The output tools will validate your data and report issues
6. You can update data multiple times - keep refining as you explore
7. The CLI shows your progress in real-time as you update the output`;
};
|
|
3133
|
+
/**
 * Extraction strategy that hands the artifacts to a tool-using LLM agent.
 *
 * The agent explores a virtual filesystem built from the artifacts and reports
 * its result through four output tools (`set_output_data`,
 * `update_output_data`, `finish`, `fail`). `run()` resolves with the collected
 * data or rejects when the agent fails or never produces output.
 */
var AgentStrategy = class {
  /** Strategy identifier used in debug steps and telemetry attributes. */
  name = "agent";
  /** Configuration object passed to the constructor. */
  config;

  /**
   * @param {object} config - Strategy configuration. Fields read by this class:
   *   `maxSteps`, `debug`, `model`, `provider`, `modelId`, `apiKey`, `agentDir`,
   *   `systemPrompt`, `outputInstructions`, `verbose`.
   */
  constructor(config) {
    this.config = config;
  }

  /**
   * @returns {number} The step total reported with progress events:
   *   `config.maxSteps`, or 50 when it is null/undefined.
   */
  getEstimatedSteps() {
    return this.config.maxSteps ?? 50;
  }

  /**
   * Runs the agent against the given artifacts.
   *
   * @param {object} options
   * @param {Array} options.artifacts - Artifacts exposed through the virtual filesystem.
   * @param {object} options.schema - Target schema; serialized into the prompts.
   * @param {object} [options.debug] - Debug logger; falls back to `config.debug`.
   * @param {object} [options.telemetry] - Span recorder (`startSpan` / `endSpan`).
   * @param {object} [options.events] - Optional callbacks: `onStep`,
   *   `onAgentMessage`, `onAgentToolStart`, `onAgentToolEnd`.
   * @returns {Promise<{data: *, usage: {inputTokens: number, outputTokens: number, totalTokens: number}}>}
   * @throws {Error} When the agent calls `fail`, or ends without any output data
   *   even after the retry prompt.
   */
  async run(options) {
    const debug = options.debug ?? this.config.debug;
    const { telemetry } = options;
    const maxSteps = this.config.maxSteps ?? 50;
    const agentSpan = telemetry?.startSpan({
      name: "strategy.agent",
      kind: "AGENT",
      attributes: {
        "strategy.name": this.name,
        "agent.max_steps": maxSteps,
        "agent.model": this.config.model ? "custom" : `${this.config.provider}/${this.config.modelId}`,
        "agent.artifacts.count": options.artifacts.length
      }
    });
    // Message spans are kept on a stack instead of being tagged onto the event
    // object: message_start and message_end may be delivered as distinct event
    // objects, in which case a tag written on one is invisible on the other.
    const openMessageSpans = [];
    const activeToolSpans = new Map();

    // Small emitters so every progress report carries the same shape.
    const reportStep = (step, label, extra) => options.events?.onStep?.({
      step,
      total: this.getEstimatedSteps(),
      label,
      ...extra
    });
    const debugStep = (step, label) => debug?.step({
      step,
      total: this.getEstimatedSteps(),
      label,
      strategy: this.name
    });

    await reportStep(1, "agent_explore");
    debugStep(1, "agent_explore");

    // Build the in-memory filesystem the agent's tools operate on.
    const filesystem = createVirtualFilesystem(options.artifacts);
    const files = {
      "/artifact.json": filesystem["/artifact.json"],
      "/manifest.json": filesystem["/manifest.json"]
    };
    for (const [path, content] of filesystem.virtualFiles) {
      files[path] = content;
    }
    const bash = new Bash2({
      files,
      cwd: "/"
    });
    const virtualTools = createVirtualFilesystemTools(bash, filesystem.getImageByPath);

    const schema = JSON.stringify(options.schema, null, 2);
    const systemPrompt = this.config.systemPrompt ?? defaultSystemPrompt(schema, this.config.outputInstructions);
    const callId = `agent_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
    debug?.llmCallStart({
      callId,
      model: this.config.model ? JSON.stringify(this.config.model) : "default",
      schemaName: "extract",
      systemLength: systemPrompt.length,
      userLength: 0,
      artifactCount: options.artifacts.length
    });
    debug?.promptSystem({ callId, system: systemPrompt });
    const startTime = Date.now();

    // Credentials and model resolution.
    const agentDir = this.config.agentDir;
    const authStorage = agentDir ? AuthStorage.create(`${agentDir}/auth.json`) : AuthStorage.create();
    if (this.config.apiKey && this.config.provider) {
      authStorage.setRuntimeApiKey(this.config.provider, this.config.apiKey);
    }
    const modelRegistry = new ModelRegistry(authStorage);
    let model = this.config.model;
    if (!model && this.config.provider && this.config.modelId) {
      if (this.config.verbose) {
        console.error(`[AgentStrategy] Looking up model: ${this.config.provider}/${this.config.modelId}`);
      }
      model = modelRegistry.find(this.config.provider, this.config.modelId);
      if (this.config.verbose) {
        console.error(`[AgentStrategy] Model resolved: ${model ? "success" : "failed"}`);
        if (model) {
          console.error(`[AgentStrategy] Model info:`, JSON.stringify(model).slice(0, 200));
        }
      }
    } else if (model && this.config.verbose) {
      console.error(`[AgentStrategy] Using pre-configured model`);
    }

    const loader = new DefaultResourceLoader({
      cwd: "/artifacts",
      agentDir: agentDir || void 0,
      systemPromptOverride: () => systemPrompt
    });
    await loader.reload();
    const settingsManager = SettingsManager.inMemory({
      // Disable compaction for extraction tasks
      compaction: { enabled: false }
    });

    // Mutable run state shared by the output tools and the event listener.
    let currentOutput = null;
    let isFinished = false;
    let extractionFailed = false;
    let failureReason = null;
    let usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    let stepCount = 0;
    let finalResponse = "";
    let textBuffer = "";
    let isRetrying = false;

    // NOTE(review): despite the tool descriptions mentioning the schema, this
    // only checks that the data is JSON-serializable; options.schema is not
    // consulted here.
    const validateData = (data) => {
      try {
        JSON.stringify(data);
        return { valid: true, errors: [] };
      } catch (e) {
        return { valid: false, errors: [e.message] };
      }
    };
    const describeValidation = (validation) => validation.valid ? "\u2713 Valid structure" : `\u2717 Validation issues: ${validation.errors.join(", ")}`;
    const toolError = (text) => ({
      content: [{ type: "text", text }],
      isError: true
    });

    await reportStep(2, "agent_init");
    debugStep(2, "agent_init");

    const { Type: Type2 } = await import("@sinclair/typebox");

    const setOutputDataTool = {
      name: "set_output_data",
      label: "Set Output Data",
      description: "Set the initial output data. You can use any structure - it will be validated against the schema.",
      parameters: Type2.Object({
        data: Type2.Any({ description: "The output data to set" })
      }),
      execute: async (toolCallId, params) => {
        // A missing payload would otherwise be stored as `undefined`, slip past
        // the `=== null` checks in finish(), and crash the label preview below.
        if (params?.data === undefined) {
          return toolError('Error: Missing required "data" parameter. Call set_output_data with {"data": {...}}.');
        }
        currentOutput = params.data;
        const validation = validateData(params.data);
        const status = describeValidation(validation);
        await reportStep(stepCount + 1, `Output: ${(JSON.stringify(params.data) ?? "").slice(0, 50)}...`);
        return {
          content: [{ type: "text", text: `Output data set. ${status}` }],
          details: { validation }
        };
      }
    };

    const updateOutputDataTool = {
      name: "update_output_data",
      label: "Update Output Data",
      description: "Update the output data by merging changes. Existing fields are preserved, new fields are added.",
      parameters: Type2.Object({
        changes: Type2.Record(Type2.String(), Type2.Any(), {
          description: "Changes to merge into existing data"
        })
      }),
      execute: async (toolCallId, params) => {
        if (currentOutput === null) {
          return toolError("Error: No output data set yet. Use set_output_data first.");
        }
        if (!isAgentPlainObject(params?.changes)) {
          return toolError('Error: Missing or invalid "changes" parameter. Call update_output_data with {"changes": {...}}.');
        }
        if (!isAgentPlainObject(currentOutput)) {
          // Merging into an array/primitive would silently discard the changes.
          return toolError("Error: Current output is not an object and cannot be merged into. Use set_output_data to replace it.");
        }
        currentOutput = mergeAgentOutput(currentOutput, params.changes);
        const validation = validateData(currentOutput);
        const status = describeValidation(validation);
        await reportStep(stepCount + 1, `Updated: ${(JSON.stringify(params.changes) ?? "").slice(0, 50)}...`);
        return {
          content: [{ type: "text", text: `Output data updated. ${status}` }],
          details: { validation, currentOutput }
        };
      }
    };

    const finishTool = {
      name: "finish",
      label: "Finish Extraction",
      description: "Complete the extraction. Can only be called when data validates against the schema.",
      parameters: Type2.Object({}),
      execute: async (toolCallId) => {
        if (extractionFailed) {
          return toolError("Cannot finish - extraction was marked as failed.");
        }
        if (currentOutput === null) {
          return toolError("Error: No output data set. Extract data first.");
        }
        const validation = validateData(currentOutput);
        if (!validation.valid) {
          const finishError = `Schema validation failed: ${validation.errors.join(", ")}`;
          return toolError(`Cannot finish: ${finishError}\n\nFix the data and try again, or use fail() if extraction is impossible.`);
        }
        isFinished = true;
        return {
          content: [{ type: "text", text: "\u2713 Extraction complete! Data validated successfully." }]
        };
      }
    };

    const failTool = {
      name: "fail",
      label: "Fail Extraction",
      description: "Mark extraction as failed when the schema cannot be satisfied with the available data.",
      parameters: Type2.Object({
        reason: Type2.String({ description: "Why extraction failed or what data was missing" })
      }),
      execute: async (toolCallId, params) => {
        extractionFailed = true;
        failureReason = params.reason;
        return {
          content: [{ type: "text", text: `Extraction marked as failed: ${params.reason}` }]
        };
      }
    };

    // NOTE(review): only these five filesystem tools are registered; confirm
    // whether an image-viewing tool from virtualTools should be included too.
    const allTools = [
      virtualTools.read,
      virtualTools.bash,
      virtualTools.grep,
      virtualTools.find,
      virtualTools.ls,
      setOutputDataTool,
      updateOutputDataTool,
      finishTool,
      failTool
    ];
    if (this.config.verbose) {
      console.error(`[AgentStrategy] Creating session with ${allTools.length} tools`);
      console.error(`[AgentStrategy] Tool names: ${allTools.map((t) => t.name).join(", ")}`);
      for (const tool of allTools) {
        console.error(`[AgentStrategy] Tool "${tool.name}" details:`);
        console.error(` - label: ${tool.label}`);
        console.error(` - description: ${tool.description?.slice(0, 100)}...`);
        console.error(` - has parameters: ${!!tool.parameters}`);
        if (tool.parameters) {
          console.error(` - parameters type: ${tool.parameters.type}`);
          console.error(` - required fields: ${JSON.stringify(tool.parameters.required)}`);
        }
      }
    }

    const { session } = await createAgentSession({
      model,
      authStorage,
      modelRegistry,
      resourceLoader: loader,
      sessionManager: SessionManager.inMemory(),
      settingsManager,
      // No default tools
      tools: [],
      customTools: allTools
    });
    if (this.config.verbose) {
      console.error(`[AgentStrategy] Session created successfully`);
    }
    await reportStep(3, "agent_session_ready");
    debugStep(3, "agent_session_ready");

    // Emits one streamed text line as a progress step (blank lines are skipped).
    const emitProgressLine = (text) => {
      const line = text.trim();
      if (line.length > 0) {
        reportStep(stepCount, `\u2192 ${line.slice(0, 120)}`);
      }
    };

    // Character-count heuristic (~4 characters per token) for one role.
    const estimateTokens = (messages, role) => messages.reduce((sum, msg) => {
      if (msg.role !== role) {
        return sum;
      }
      return sum + Math.ceil((JSON.stringify(msg.content) ?? "").length / 4);
    }, 0);

    // Single listener for both the initial prompt and the retry prompt.
    // Attaching a second listener for the retry would double-count every
    // delta and tool call while the first one is still subscribed.
    const handleSessionEvent = (event) => {
      switch (event.type) {
        case "message_update": {
          if (event.assistantMessageEvent.type !== "text_delta") {
            break;
          }
          const delta = event.assistantMessageEvent.delta;
          finalResponse += delta;
          options.events?.onAgentMessage?.({
            content: delta,
            role: "assistant"
          });
          textBuffer += delta;
          let newlineIndex;
          while ((newlineIndex = textBuffer.indexOf("\n")) !== -1) {
            emitProgressLine(textBuffer.slice(0, newlineIndex));
            textBuffer = textBuffer.slice(newlineIndex + 1);
          }
          // Flush long unterminated text so progress stays visible.
          if (textBuffer.length > 100) {
            emitProgressLine(textBuffer);
            textBuffer = "";
          }
          break;
        }
        case "tool_execution_start": {
          stepCount++;
          if (telemetry && agentSpan) {
            const toolSpan = telemetry.startSpan({
              name: `agent.tool.${event.toolName}`,
              kind: "TOOL",
              parentSpan: agentSpan,
              attributes: {
                "tool.name": event.toolName,
                "tool.call_id": event.toolCallId,
                "tool.args": JSON.stringify(event.args || {})
              }
            });
            activeToolSpans.set(event.toolCallId, toolSpan);
          }
          options.events?.onAgentToolStart?.({
            toolName: event.toolName,
            toolCallId: event.toolCallId,
            args: event.args || {}
          });
          const { label, detail } = describeAgentToolCall(event.toolName, event.args, isRetrying);
          reportStep(stepCount + 1, label, { detail });
          debugStep(stepCount + 1, label);
          break;
        }
        case "tool_execution_end": {
          const hasError = Boolean(event.isError || event.error);
          const toolSpan = activeToolSpans.get(event.toolCallId);
          if (toolSpan && telemetry) {
            telemetry.endSpan(toolSpan, {
              status: hasError ? "error" : "ok",
              error: hasError ? new Error(event.error || "Tool execution failed") : void 0,
              output: event.result
            });
            activeToolSpans.delete(event.toolCallId);
          }
          options.events?.onAgentToolEnd?.({
            toolCallId: event.toolCallId,
            result: event.result,
            error: hasError ? event.error || "Tool execution failed" : void 0
          });
          if (hasError && this.config.verbose) {
            console.error(`[AgentStrategy] Tool execution failed: ${event.error || "Unknown tool error"}`);
            console.error(`[AgentStrategy] Tool: ${event.toolName || "unknown"}, Call ID: ${event.toolCallId || "unknown"}`);
            if (event.result) {
              console.error(`[AgentStrategy] Result:`, JSON.stringify(event.result));
            }
          }
          break;
        }
        case "agent_end": {
          if (textBuffer.trim().length > 0) {
            emitProgressLine(textBuffer);
            textBuffer = "";
          }
          if (event.messages && event.messages.length > 0) {
            const inputTokens = estimateTokens(event.messages, "user");
            const outputTokens = estimateTokens(event.messages, "assistant");
            usage = {
              inputTokens,
              outputTokens,
              totalTokens: inputTokens + outputTokens
            };
          }
          reportStep(stepCount + 1, "agent_complete");
          debugStep(stepCount + 1, "agent_complete");
          break;
        }
        case "message_start": {
          if (telemetry && agentSpan) {
            openMessageSpans.push(telemetry.startSpan({
              name: "agent.llm.generate",
              kind: "LLM",
              parentSpan: agentSpan,
              attributes: {
                "llm.message_type": event.message?.role || "unknown",
                "llm.type": "agent_message"
              }
            }));
          }
          break;
        }
        case "message_end": {
          // Close the most recently opened message span.
          const llmSpan = openMessageSpans.pop();
          if (llmSpan && telemetry) {
            telemetry.endSpan(llmSpan, {
              status: "ok",
              // Last 200 chars as output preview
              output: finalResponse.slice(-200)
            });
          }
          break;
        }
        case "agent_start": {
          if (this.config.verbose) {
            console.error("[AgentStrategy] Agent started processing");
          }
          break;
        }
        case "turn_start":
        case "turn_end":
        case "auto_compaction_start":
        case "auto_compaction_end":
        case "auto_retry_start":
        case "auto_retry_end": {
          break;
        }
        default: {
          if (event.type && !event.type.includes("_") && this.config.verbose) {
            console.error(`[AgentStrategy] Unexpected event type: ${event.type}`);
          }
          break;
        }
      }
    };

    // Ends every span that is still open and then the strategy span itself.
    const closeOpenSpans = (spanResult, agentResult) => {
      if (!telemetry) {
        return;
      }
      for (const span of openMessageSpans) {
        telemetry.endSpan(span, spanResult);
      }
      openMessageSpans.length = 0;
      for (const span of activeToolSpans.values()) {
        telemetry.endSpan(span, spanResult);
      }
      activeToolSpans.clear();
      if (agentSpan) {
        telemetry.endSpan(agentSpan, agentResult);
      }
    };

    let unsubscribe = null;
    const stopListening = () => {
      if (unsubscribe) {
        const stop = unsubscribe;
        unsubscribe = null;
        stop();
      }
    };

    try {
      unsubscribe = session.subscribe((event) => {
        try {
          handleSessionEvent(event);
        } catch (eventHandlerError) {
          if (this.config.verbose) {
            console.error(`[AgentStrategy] Error in event handler: ${eventHandlerError.message}`);
            console.error("AgentStrategy event handler error:", eventHandlerError);
          }
          throw eventHandlerError;
        }
      });

      await session.prompt(
        "Begin exploring the artifacts and extract the required data according to the schema. Start by reading the manifest file.",
        {
          // Ensure the agent keeps running until it calls finish() or fail()
        }
      );

      // One follow-up prompt if the agent ended without touching the output tools.
      if (currentOutput === null && !extractionFailed && !isFinished) {
        if (this.config.verbose) {
          console.error("[AgentStrategy] No output after first run. Sending retry prompt...");
        }
        await reportStep(stepCount + 1, "Retry: forcing output extraction");
        isRetrying = true;
        await session.prompt(
          [
            "You have explored the artifacts but haven't called any output tools yet. You MUST now extract data and call either:",
            "1. set_output_data with the extracted data, then finish()",
            "2. fail() if the document doesn't contain the required information",
            "",
            `The schema requires: ${JSON.stringify(options.schema).slice(0, 200)}...`,
            "",
            "Extract what you can from the artifacts and set the output data NOW."
          ].join("\n"),
          {}
        );
      }
      stopListening();

      const durationMs = Date.now() - startTime;
      if (extractionFailed) {
        throw new Error(`Extraction failed: ${failureReason}`);
      }
      if (currentOutput === null) {
        if (isFinished) {
          throw new Error("Agent called finish() but no output data was set.");
        }
        if (stepCount > 0) {
          // Tools were called, but never the output tools.
          throw new Error([
            "Agent did not produce any output data. The model may not support tool calling properly.",
            "",
            "Troubleshooting:",
            "1. Check if your model supports function calling/tool use",
            "2. Try a different model like anthropic/claude-sonnet-4 or openai/gpt-4o",
            "3. See MODEL_COMPATIBILITY.md for supported models",
            "",
            'If you continue to see "Tool execution failed" errors with empty tool names,',
            "the model is not compatible with the agent strategy. Use --strategy simple instead."
          ].join("\n"));
        }
        throw new Error([
          "Agent did not produce any output data. No data was extracted.",
          "",
          "This can happen when:",
          "- The model doesn't support tool calling properly",
          "- The agent got confused and didn't use the output tools",
          "- The document doesn't contain extractable data",
          "",
          "Suggestions:",
          "1. Try a different model with better tool support (anthropic/claude-sonnet-4)",
          "2. Use --strategy simple for models without tool calling",
          "3. Check if the document actually contains the data specified in your schema",
          "",
          "Retry was attempted but the agent still didn't produce output."
        ].join("\n"));
      }
      if (!isFinished) {
        // Best effort: accept data the agent collected even without finish().
        if (this.config.verbose) {
          console.error("[AgentStrategy] Warning: Agent did not call finish(). Using collected data.");
        }
        const validation = validateData(currentOutput);
        if (!validation.valid && this.config.verbose) {
          console.error(`[AgentStrategy] Data validation issues: ${validation.errors.join(", ")}`);
        }
      }
      const extractedData = currentOutput;

      debug?.rawResponse({ callId, response: extractedData });
      debug?.llmCallComplete({
        callId,
        success: true,
        inputTokens: usage.inputTokens,
        outputTokens: usage.outputTokens,
        totalTokens: usage.totalTokens,
        durationMs
      });
      await reportStep(this.getEstimatedSteps(), "extract");
      debugStep(this.getEstimatedSteps(), "extract");
      closeOpenSpans({ status: "ok" }, { status: "ok", output: extractedData });
      return { data: extractedData, usage };
    } catch (error) {
      const durationMs = Date.now() - startTime;
      const asError = error instanceof Error ? error : new Error(String(error));
      debug?.llmCallComplete({
        callId,
        success: false,
        inputTokens: usage.inputTokens,
        outputTokens: usage.outputTokens,
        totalTokens: usage.totalTokens,
        durationMs,
        error: asError.message
      });
      closeOpenSpans({ status: "error", error: asError }, { status: "error", error: asError });
      throw error;
    } finally {
      // Detach the listener on the failure path too, then release the session.
      stopListening();
      session.dispose();
    }
  }
};

/**
 * Reports whether a value is a non-null, non-array object.
 * @param {*} item
 * @returns {boolean}
 */
function isAgentPlainObject(item) {
  return Boolean(item) && typeof item === "object" && !Array.isArray(item);
}

/**
 * Recursively merges `source` into a shallow copy of `target`.
 *
 * Nested values are merged only when BOTH sides hold a non-array object under
 * an OWN key of `target`; in every other case the incoming value replaces the
 * existing one. This keeps an incoming object from being lost when the old
 * value is `null`, a primitive, or an inherited member such as `constructor`.
 * Keys are written as own data properties, so a `__proto__` key in `source`
 * cannot change the result's prototype.
 *
 * @param {*} target - Existing data. If it or `source` is not a non-array
 *   object, a shallow object copy of `target` is returned unchanged.
 * @param {*} source - Changes to apply.
 * @returns {object} A new object; `target` and `source` are not mutated.
 */
function mergeAgentOutput(target, source) {
  const output = { ...target };
  if (!isAgentPlainObject(target) || !isAgentPlainObject(source)) {
    return output;
  }
  for (const key of Object.keys(source)) {
    const incoming = source[key];
    const existing = Object.prototype.hasOwnProperty.call(target, key) ? target[key] : void 0;
    const value = isAgentPlainObject(incoming) && isAgentPlainObject(existing) ? mergeAgentOutput(existing, incoming) : incoming;
    Object.defineProperty(output, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true
    });
  }
  return output;
}

/**
 * Builds the progress label and detail text shown for a tool call.
 *
 * Arguments of an unexpected type fall through to the generic branch rather
 * than throwing, because tool arguments are produced by the model.
 *
 * @param {string} toolName - Name of the tool being started.
 * @param {object} [args] - Arguments the agent passed to the tool.
 * @param {boolean} [isRetry=false] - When true, output-tool labels get a " (retry)" suffix.
 * @returns {{label: string, detail: string}}
 */
function describeAgentToolCall(toolName, args, isRetry = false) {
  const isText = (value) => typeof value === "string" && value.length > 0;
  if (toolName === "read" && isText(args?.file_path)) {
    const fileName = args.file_path.split("/").pop() || args.file_path;
    const pagination = [];
    if (args.offset && args.offset > 1) {
      pagination.push(`offset ${args.offset}`);
    }
    if (args.limit) {
      pagination.push(`limit ${args.limit}`);
    }
    const paginationStr = pagination.length > 0 ? ` (${pagination.join(", ")})` : "";
    return { label: `Read ${fileName}${paginationStr}`, detail: "" };
  }
  if (toolName === "bash" && isText(args?.command)) {
    const cmd = args.command.length > 40 ? `${args.command.slice(0, 37)}...` : args.command;
    return { label: `Bash: ${cmd}`, detail: "" };
  }
  if (toolName === "grep" && args?.pattern) {
    return {
      label: `Grep "${args.pattern}"`,
      detail: isText(args.path) ? `in ${args.path.split("/").pop()}` : ""
    };
  }
  if (toolName === "find" && args?.path) {
    return {
      label: `Find`,
      detail: args.name ? `"${args.name}" in ${args.path}` : `in ${args.path}`
    };
  }
  if (toolName === "ls" && args?.path) {
    return { label: `List ${args.path}`, detail: args.recursive ? "recursive" : "" };
  }
  const retrySuffix = isRetry ? " (retry)" : "";
  switch (toolName) {
    case "set_output_data":
      return {
        label: `Set Output${retrySuffix}`,
        detail: args?.data ? JSON.stringify(args.data).slice(0, 80) : ""
      };
    case "update_output_data":
      return {
        label: `Update Output${retrySuffix}`,
        detail: args?.changes ? JSON.stringify(args.changes).slice(0, 80) : ""
      };
    case "finish":
      return { label: `Finish${retrySuffix}`, detail: "" };
    case "fail":
      return { label: `Fail${retrySuffix}`, detail: args?.reason || "" };
    default:
      return { label: toolName, detail: args ? JSON.stringify(args).slice(0, 100) : "" };
  }
}
|
|
3909
|
+
/**
 * Factory shorthand for the agent strategy.
 *
 * @param {object} config - Configuration forwarded unchanged to the AgentStrategy constructor.
 * @returns {AgentStrategy} A newly constructed strategy instance.
 */
var agent = (config) => new AgentStrategy(config);
|
|
3912
|
+
export {
|
|
3913
|
+
AgentStrategy,
|
|
3914
|
+
DoublePassAutoMergeStrategy,
|
|
3915
|
+
DoublePassStrategy,
|
|
3916
|
+
ParallelAutoMergeStrategy,
|
|
3917
|
+
ParallelStrategy,
|
|
3918
|
+
SequentialAutoMergeStrategy,
|
|
3919
|
+
SequentialStrategy,
|
|
3920
|
+
SimpleStrategy,
|
|
3921
|
+
agent,
|
|
3922
|
+
doublePass,
|
|
3923
|
+
doublePassAutoMerge,
|
|
3924
|
+
parallel,
|
|
3925
|
+
parallelAutoMerge,
|
|
3926
|
+
sequential,
|
|
3927
|
+
sequentialAutoMerge,
|
|
3928
|
+
simple
|
|
3929
|
+
};
|
|
3930
|
+
//# sourceMappingURL=strategies.js.map
|