@struktur/sdk 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4111 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/strategies.js +2435 -0
- package/dist/strategies.js.map +1 -0
- package/package.json +25 -13
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/providers.ts +0 -7
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/index.ts +0 -6
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/index.ts +0 -7
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
package/src/parsers/pdf.test.ts
DELETED
|
@@ -1,394 +0,0 @@
|
|
|
1
|
-
import { test, expect, mock } from "bun:test";
|
|
2
|
-
|
|
3
|
-
// ---------------------------------------------------------------------------
// Test double for pdf-parse. The values returned by getText(), getImage() and
// getScreenshot() come from the module-level stub variables below, so
// parsePdf can be exercised without a real PDF file.
// ---------------------------------------------------------------------------

/** One page of text as returned by the stubbed getText(). */
interface TextPage {
  num: number;
  text: string;
}

/** One embedded image as returned by the stubbed getImage(). */
interface EmbeddedImageStub {
  dataUrl: string;
  width: number;
  height: number;
}

/** All embedded images of one page as returned by the stubbed getImage(). */
interface PageImagesStub {
  pageNumber: number;
  images: EmbeddedImageStub[];
}

/** One rendered page as returned by the stubbed getScreenshot(). */
interface ScreenshotPageStub {
  pageNumber: number;
  dataUrl: string;
  width: number;
  height: number;
}

// Stub state. The mock reads these on every call, so each test assigns them
// before it calls parsePdf.
let stubTextPages: TextPage[] = [];
let stubTextFull = "";
let stubImagePages: PageImagesStub[] = [];
let stubScreenshotPages: ScreenshotPageStub[] = [];
let stubGetImageThrows = false;
let stubGetScreenshotThrows = false;

mock.module("pdf-parse", () => {
  class PDFParse {
    constructor(_opts: unknown) {}

    /** Returns the stubbed pages and full text; `total` is never reported as 0. */
    async getText() {
      const pages = stubTextPages;
      return { pages, text: stubTextFull, total: pages.length || 1 };
    }

    /** Returns the stubbed embedded images, or rejects when stubGetImageThrows is set. */
    async getImage(_params?: unknown) {
      if (stubGetImageThrows) throw new Error("image extraction failed");
      const pages = stubImagePages;
      return { pages, total: pages.length };
    }

    /** Returns the stubbed screenshots, or rejects when stubGetScreenshotThrows is set. */
    async getScreenshot(_params?: unknown) {
      if (stubGetScreenshotThrows) throw new Error("screenshot rendering failed");
      const pages = stubScreenshotPages;
      return { pages, total: pages.length };
    }

    /** Returns fixed document info. */
    async getInfo() {
      return { Title: "Test PDF" };
    }

    async destroy() {}
  }

  return { PDFParse };
});
|
|
51
|
-
|
|
52
|
-
// Import after mock is registered
|
|
53
|
-
const { parsePdf } = await import("./pdf");
|
|
54
|
-
|
|
55
|
-
// ---------------------------------------------------------------------------
|
|
56
|
-
// Helpers
|
|
57
|
-
// ---------------------------------------------------------------------------
|
|
58
|
-
|
|
59
|
-
/**
 * Builds a small buffer that starts with a PDF header marker. The pdf-parse
 * mock never reads it; it only has to be a Buffer.
 */
function makeBuffer(): Buffer {
  const fakePdfBytes = "%PDF-1.4 fake";
  return Buffer.from(fakePdfBytes);
}
|
|
62
|
-
|
|
63
|
-
// ---------------------------------------------------------------------------
|
|
64
|
-
// Tests
|
|
65
|
-
// ---------------------------------------------------------------------------
|
|
66
|
-
|
|
67
|
-
/** Values a test can supply for the pdf-parse mock; anything omitted is reset. */
type StubOverrides = {
  textPages?: TextPage[];
  textFull?: string;
  imagePages?: PageImagesStub[];
  screenshotPages?: ScreenshotPageStub[];
  getImageThrows?: boolean;
  getScreenshotThrows?: boolean;
};

/**
 * Assigns every stub variable read by the pdf-parse mock. Omitted fields fall
 * back to the empty / non-throwing value, so no state carries over from a
 * previous test.
 */
function configureStubs(overrides: StubOverrides = {}): void {
  stubTextPages = overrides.textPages ?? [];
  stubTextFull = overrides.textFull ?? "";
  stubImagePages = overrides.imagePages ?? [];
  stubScreenshotPages = overrides.screenshotPages ?? [];
  stubGetImageThrows = overrides.getImageThrows ?? false;
  stubGetScreenshotThrows = overrides.getScreenshotThrows ?? false;
}

test("parsePdf extracts per-page text when pages are present", async () => {
  configureStubs({
    textPages: [
      { num: 1, text: "Hello page one" },
      { num: 2, text: "Hello page two" },
    ],
    textFull: "Hello page one\nHello page two",
  });

  const { type, contents } = await parsePdf(makeBuffer());

  expect(type).toBe("pdf");
  expect(contents).toHaveLength(2);
  expect(contents[0]).toEqual({ page: 1, text: "Hello page one" });
  expect(contents[1]).toEqual({ page: 2, text: "Hello page two" });
});

test("parsePdf falls back to full text when no pages are returned", async () => {
  configureStubs({ textFull: "entire document text" });

  const { contents } = await parsePdf(makeBuffer());

  expect(contents).toHaveLength(1);
  const only = contents[0];
  expect(only?.text).toBe("entire document text");
  expect(only?.page).toBeUndefined();
});

test("parsePdf attaches images to the matching page content entry", async () => {
  configureStubs({
    textPages: [
      { num: 1, text: "Page with image" },
      { num: 2, text: "Page without image" },
    ],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,abc123", width: 100, height: 50 }],
      },
    ],
  });

  const { contents } = await parsePdf(makeBuffer());

  expect(contents).toHaveLength(2);
  const [withImage, withoutImage] = contents;

  expect(withImage?.page).toBe(1);
  expect(withImage?.text).toBe("Page with image");
  expect(withImage?.media).toHaveLength(1);
  expect(withImage?.media?.[0]).toMatchObject({
    type: "image",
    base64: "abc123",
    width: 100,
    height: 50,
  });

  expect(withoutImage?.page).toBe(2);
  expect(withoutImage?.media).toBeUndefined();
});

test("parsePdf strips data URL prefix to produce raw base64", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "text" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/jpeg;base64,/9j/4AAQ==", width: 200, height: 200 }],
      },
    ],
  });

  const { contents } = await parsePdf(makeBuffer());

  expect(contents[0]?.media?.[0]?.base64).toBe("/9j/4AAQ==");
});

test("parsePdf creates a content entry for pages that have only images (no text)", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "text only page" }],
    imagePages: [
      {
        pageNumber: 2,
        images: [{ dataUrl: "data:image/png;base64,img2", width: 80, height: 80 }],
      },
    ],
  });

  const { contents } = await parsePdf(makeBuffer());

  // Page 1 carries the text, page 2 exists only because of its image.
  expect(contents).toHaveLength(2);

  const imageOnly = contents.find((entry) => entry.page === 2);
  expect(imageOnly).toBeDefined();
  expect(imageOnly?.text).toBeUndefined();
  expect(imageOnly?.media).toHaveLength(1);
  expect(imageOnly?.media?.[0]?.base64).toBe("img2");
});

test("parsePdf continues without images when getImage() throws", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "resilient page" }],
    getImageThrows: true,
  });

  const { contents } = await parsePdf(makeBuffer());

  expect(contents).toHaveLength(1);
  expect(contents[0]?.text).toBe("resilient page");
  expect(contents[0]?.media).toBeUndefined();
});

test("parsePdf produces at least one content entry for empty documents", async () => {
  configureStubs();

  const { contents } = await parsePdf(makeBuffer());

  expect(contents).toHaveLength(1);
  expect(contents[0]?.text).toBe("");
});

test("parsePdf includes numpages in metadata", async () => {
  configureStubs({ textPages: [{ num: 1, text: "one" }] });

  const { metadata } = await parsePdf(makeBuffer());

  expect(metadata?.numpages).toBe(1);
  expect((metadata?.info as Record<string, unknown>)?.Title).toBe("Test PDF");
});

test("parsePdf raw() returns the original buffer", async () => {
  configureStubs({ textPages: [{ num: 1, text: "raw test" }] });

  const source = makeBuffer();
  const artifact = await parsePdf(source);

  // toBe checks identity: raw() must hand back the very same Buffer instance.
  expect(await artifact.raw()).toBe(source);
});

test("parsePdf with includeImages: false skips image extraction", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "text only" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,abc123", width: 100, height: 50 }],
      },
    ],
  });

  // The mock would return an image, so an entry without media shows that
  // includeImages: false kept the image data out of the result.
  const { contents } = await parsePdf(makeBuffer(), { includeImages: false });

  expect(contents).toHaveLength(1);
  expect(contents[0]?.text).toBe("text only");
  expect(contents[0]?.media).toBeUndefined();
});

test("parsePdf with includeImages: true behaves the same as the default", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "with images" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,xyz456", width: 80, height: 80 }],
      },
    ],
  });

  const { contents } = await parsePdf(makeBuffer(), { includeImages: true });
  const media = contents[0]?.media;

  expect(media).toHaveLength(1);
  expect(media?.[0]?.base64).toBe("xyz456");
});

test("parsePdf with no options still includes images by default", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "default" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,def789", width: 60, height: 60 }],
      },
    ],
  });

  const { contents } = await parsePdf(makeBuffer());
  const media = contents[0]?.media;

  expect(media).toHaveLength(1);
  expect(media?.[0]?.base64).toBe("def789");
});

test("parsePdf marks embedded images with imageType: 'embedded'", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page with embedded image" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,embedded123", width: 100, height: 100 }],
      },
    ],
  });

  const { contents } = await parsePdf(makeBuffer());
  const media = contents[0]?.media;

  expect(media).toHaveLength(1);
  expect(media?.[0]?.imageType).toBe("embedded");
});

test("parsePdf marks screenshots with imageType: 'screenshot'", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page with screenshot" }],
    screenshotPages: [
      { pageNumber: 1, dataUrl: "data:image/png;base64,screenshot456", width: 800, height: 600 },
    ],
  });

  const { contents } = await parsePdf(makeBuffer(), { screenshots: true });
  const media = contents[0]?.media;

  expect(media).toHaveLength(1);
  expect(media?.[0]?.imageType).toBe("screenshot");
});

test("parsePdf correctly differentiates embedded images and screenshots on the same page", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page with both" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,embedded789", width: 100, height: 100 }],
      },
    ],
    screenshotPages: [
      { pageNumber: 1, dataUrl: "data:image/png;base64,screenshot012", width: 800, height: 600 },
    ],
  });

  const { contents } = await parsePdf(makeBuffer(), { screenshots: true });
  const media = contents[0]?.media;

  expect(media).toHaveLength(2);

  const embedded = media?.find((image) => image.imageType === "embedded");
  const screenshot = media?.find((image) => image.imageType === "screenshot");

  expect(embedded).toBeDefined();
  expect(embedded?.base64).toBe("embedded789");
  expect(screenshot).toBeDefined();
  expect(screenshot?.base64).toBe("screenshot012");
});

test("parsePdf without screenshots option does not include screenshot images", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,embedded345", width: 100, height: 100 }],
      },
    ],
    screenshotPages: [
      { pageNumber: 1, dataUrl: "data:image/png;base64,screenshot678", width: 800, height: 600 },
    ],
  });

  const { contents } = await parsePdf(makeBuffer());
  const media = contents[0]?.media;

  expect(media).toHaveLength(1);
  expect(media?.[0]?.imageType).toBe("embedded");
});

test("parsePdf continues without screenshots when getScreenshot() throws", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "resilient page" }],
    getScreenshotThrows: true,
  });

  const { contents } = await parsePdf(makeBuffer(), { screenshots: true });

  expect(contents).toHaveLength(1);
  expect(contents[0]?.text).toBe("resilient page");
  expect(contents[0]?.media).toBeUndefined();
});
|
package/src/parsers/pdf.ts
DELETED
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
import type { Artifact, ArtifactContent, ArtifactImage } from "../types";
|
|
2
|
-
import { collectStream } from "./collect";
|
|
3
|
-
|
|
4
|
-
/** Options accepted by parsePdf. Every field is optional. */
export type ParsePdfOptions = {
  /**
   * Controls extraction of images embedded in the PDF pages. Extracted images
   * are attached to the page's `media` field as base64 ArtifactImage entries.
   * Extraction runs unless this is exactly `false`.
   */
  includeImages?: boolean;
  /**
   * Controls page screenshots. Only when this is exactly `true` is each page
   * rendered and the rendering added to that page's `media` field.
   */
  screenshots?: boolean;
  /**
   * Render scale for screenshots; larger values give larger images.
   * Used only when `screenshotWidth` is not set. Falls back to 1.5.
   */
  screenshotScale?: number;
  /**
   * Desired screenshot width in pixels. When set it is used instead of
   * `screenshotScale`; the height follows from the page's aspect ratio.
   */
  screenshotWidth?: number;
};
|
|
28
|
-
|
|
29
|
-
/** Matches the leading "data:<mime>;base64," part of a base64 data URL. */
const DATA_URL_PREFIX = /^data:[^;]+;base64,/;

/**
 * Converts an image or screenshot record from pdf-parse into an ArtifactImage,
 * keeping only the raw base64 payload of its data URL.
 */
function toArtifactImage(
  source: { dataUrl: string; width: number; height: number },
  imageType: NonNullable<ArtifactImage["imageType"]>,
): ArtifactImage {
  return {
    type: "image",
    base64: source.dataUrl.replace(DATA_URL_PREFIX, ""),
    width: source.width,
    height: source.height,
    imageType,
  };
}

/**
 * Built-in PDF parser using pdf-parse.
 *
 * Text is extracted per page into ArtifactContent entries carrying the page
 * number. Embedded images (unless `includeImages` is false) and page
 * screenshots (only when `screenshots` is true) are attached to the matching
 * page's `media` field as base64 ArtifactImage entries. When pdf-parse returns
 * no per-page results, a single entry holding the full text is produced, with
 * the images of the first page that has any. The result always contains at
 * least one content entry.
 *
 * @param input - PDF bytes as a Buffer, or a stream that is collected into one.
 * @param options - See ParsePdfOptions.
 * @returns An Artifact with `type: "pdf"`, whose `raw()` resolves to the input
 *   bytes and whose metadata holds `numpages` plus `info` when available.
 * @throws Whatever `getText()` rejects with. Failures of image extraction,
 *   screenshot rendering and info lookup are treated as optional and ignored.
 */
export async function parsePdf(
  input: Buffer | ReadableStream<Uint8Array>,
  options?: ParsePdfOptions,
): Promise<Artifact> {
  const buffer = Buffer.isBuffer(input) ? input : await collectStream(input);

  // Dynamic import to avoid bundling issues
  const { PDFParse } = await import("pdf-parse");

  const parser = new PDFParse({ data: buffer });

  // Everything that uses the parser sits inside try/finally so the parser is
  // destroyed even when text extraction (or any later step) throws.
  try {
    const textResult = await parser.getText();

    // Page number → text, skipping pages whose text is empty or whitespace.
    const pageTextMap = new Map<number, string>();
    for (const page of textResult.pages) {
      if (page.text && page.text.trim().length > 0) {
        pageTextMap.set(page.num, page.text);
      }
    }

    // Extract embedded images unless the caller opted out.
    // imageBuffer=false saves memory (we only need the data URL).
    let imageResult;
    if (options?.includeImages !== false) {
      try {
        imageResult = await parser.getImage({ imageBuffer: false, imageDataUrl: true });
      } catch {
        // Image extraction is optional — continue without images if it fails
      }
    }

    // Render page screenshots if requested
    let screenshotResult;
    if (options?.screenshots === true) {
      try {
        const screenshotParams: {
          imageBuffer: boolean;
          imageDataUrl: boolean;
          scale?: number;
          desiredWidth?: number;
        } = { imageBuffer: false, imageDataUrl: true };

        // An explicit width takes precedence over the scale factor.
        if (options.screenshotWidth !== undefined) {
          screenshotParams.desiredWidth = options.screenshotWidth;
        } else {
          screenshotParams.scale = options.screenshotScale ?? 1.5;
        }

        screenshotResult = await parser.getScreenshot(screenshotParams);
      } catch {
        // Screenshot rendering is optional — continue without screenshots if it fails
      }
    }

    // Page number → images. Embedded images are added first, screenshots after.
    const pageImageMap = new Map<number, ArtifactImage[]>();
    if (imageResult) {
      for (const pageImages of imageResult.pages) {
        const artifactImages = pageImages.images
          .filter((img) => img.dataUrl)
          .map((img) => toArtifactImage(img, "embedded"));
        if (artifactImages.length > 0) {
          pageImageMap.set(pageImages.pageNumber, artifactImages);
        }
      }
    }
    if (screenshotResult) {
      for (const screenshot of screenshotResult.pages) {
        if (screenshot.dataUrl) {
          const existing = pageImageMap.get(screenshot.pageNumber) ?? [];
          pageImageMap.set(screenshot.pageNumber, [
            ...existing,
            toArtifactImage(screenshot, "screenshot"),
          ]);
        }
      }
    }

    let contents: ArtifactContent[];

    if (textResult.pages.length > 0) {
      // One entry per page that has text or images, in ascending page order.
      const allPageNums = new Set<number>([
        ...pageTextMap.keys(),
        ...pageImageMap.keys(),
      ]);

      contents = Array.from(allPageNums)
        .sort((a, b) => a - b)
        .map((pageNum) => {
          const entry: ArtifactContent = { page: pageNum };
          const text = pageTextMap.get(pageNum);
          if (text) entry.text = text;
          const media = pageImageMap.get(pageNum);
          if (media) entry.media = media;
          return entry;
        });
    } else {
      // Fallback: no per-page info — use full concatenated text
      const entry: ArtifactContent = { text: textResult.text };
      // Only the first image-bearing page is attached here.
      const firstPageImages = pageImageMap.size > 0
        ? pageImageMap.values().next().value
        : undefined;
      if (firstPageImages) entry.media = firstPageImages;
      contents = [entry];
    }

    // Ensure we have at least one content entry
    if (contents.length === 0) {
      contents = [{ text: "" }];
    }

    let infoResult;
    try {
      infoResult = await parser.getInfo();
    } catch {
      // Info extraction is optional
    }

    return {
      id: `artifact-${crypto.randomUUID()}`,
      type: "pdf",
      raw: async () => buffer,
      contents,
      metadata: infoResult
        ? {
            numpages: textResult.total,
            info: infoResult,
          }
        : { numpages: textResult.total },
    };
  } finally {
    await parser.destroy();
  }
}
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { runParser } from "./runner";
|
|
3
|
-
import type { ParserDef, ParserInput } from "./types";
|
|
4
|
-
import path from "node:path";
|
|
5
|
-
import os from "node:os";
|
|
6
|
-
import { rm, writeFile } from "node:fs/promises";
|
|
7
|
-
|
|
8
|
-
/**
 * Returns the JSON text that the command-based tests make their commands
 * print: an array holding a single text artifact with id "test-1".
 */
const makeArtifactJson = () =>
  JSON.stringify([
    {
      id: "test-1",
      type: "text",
      contents: [{ text: "parsed text" }],
    },
  ]);
|
|
17
|
-
|
|
18
|
-
test("command-stdin runner pipes buffer to stdin and parses output", async () => {
  // echo ignores stdin, so the parsed output depends only on the command text.
  // JSON.stringify wraps the payload in double quotes and escapes the inner ones.
  const quotedPayload = JSON.stringify(makeArtifactJson());
  const def: ParserDef = {
    type: "command-stdin",
    command: `echo ${quotedPayload}`,
  };
  const input: ParserInput = {
    kind: "buffer",
    buffer: Buffer.from("irrelevant input"),
  };

  const artifacts = await runParser(def, input, "text/plain");

  expect(artifacts).toHaveLength(1);
  const first = artifacts[0];
  expect(first?.id).toBe("test-1");
  expect(first?.contents[0]?.text).toBe("parsed text");
});
|
|
35
|
-
|
|
36
|
-
/**
 * Writes an executable shell script to the OS temp directory that prints the
 * artifact JSON, passes its path to `run`, and removes the script afterwards
 * whether or not `run` succeeds.
 */
async function withArtifactScript(run: (scriptPath: string) => Promise<void>): Promise<void> {
  const payload = makeArtifactJson();
  const randomSuffix = Math.random().toString(16).slice(2);
  const scriptPath = path.join(os.tmpdir(), `struktur-test-${randomSuffix}.sh`);
  await writeFile(scriptPath, `#!/bin/sh\necho '${payload}'`, { mode: 0o755 });

  try {
    await run(scriptPath);
  } finally {
    await rm(scriptPath, { force: true });
  }
}

test("command-file runner interpolates FILE_PATH and parses output", async () => {
  await withArtifactScript(async (scriptPath) => {
    const def: ParserDef = {
      type: "command-file",
      command: `${scriptPath} FILE_PATH`,
    };
    // The script itself doubles as the input file.
    const input: ParserInput = {
      kind: "file",
      path: scriptPath,
    };

    const artifacts = await runParser(def, input, "text/plain");

    expect(artifacts).toHaveLength(1);
    expect(artifacts[0]?.id).toBe("test-1");
  });
});

test("command-file runner writes temp file for buffer input", async () => {
  await withArtifactScript(async (scriptPath) => {
    const def: ParserDef = {
      type: "command-file",
      command: `${scriptPath} FILE_PATH`,
    };
    const input: ParserInput = {
      kind: "buffer",
      buffer: Buffer.from("some buffer data"),
    };

    const artifacts = await runParser(def, input, "text/plain");

    expect(artifacts).toHaveLength(1);
    expect(artifacts[0]?.id).toBe("test-1");
  });
});
|
|
82
|
-
|
|
83
|
-
// The title used to claim this covered a package that "exports neither
// function", but the package named below does not exist, so what is exercised
// is the import failure. The title now says so.
test("npm runner rejects when the package cannot be imported", async () => {
  const def: ParserDef = {
    type: "npm",
    package: "this-package-does-not-exist-xyz-abc-123",
  };
  const input: ParserInput = {
    kind: "buffer",
    buffer: Buffer.from("data"),
  };

  // Only the rejection is asserted; the error type and message are not pinned.
  await expect(runParser(def, input, "text/plain")).rejects.toThrow();
});
|