@struktur/sdk 2.1.1 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/index.js +4111 -0
  2. package/dist/index.js.map +1 -0
  3. package/dist/parsers.js +492 -0
  4. package/dist/parsers.js.map +1 -0
  5. package/dist/strategies.js +2435 -0
  6. package/dist/strategies.js.map +1 -0
  7. package/package.json +25 -13
  8. package/src/agent-cli-integration.test.ts +0 -47
  9. package/src/agent-export.test.ts +0 -17
  10. package/src/agent-tool-labels.test.ts +0 -50
  11. package/src/artifacts/AGENTS.md +0 -16
  12. package/src/artifacts/fileToArtifact.test.ts +0 -37
  13. package/src/artifacts/fileToArtifact.ts +0 -44
  14. package/src/artifacts/input.test.ts +0 -243
  15. package/src/artifacts/input.ts +0 -360
  16. package/src/artifacts/providers.test.ts +0 -19
  17. package/src/artifacts/providers.ts +0 -7
  18. package/src/artifacts/urlToArtifact.test.ts +0 -23
  19. package/src/artifacts/urlToArtifact.ts +0 -19
  20. package/src/auth/AGENTS.md +0 -11
  21. package/src/auth/config.test.ts +0 -132
  22. package/src/auth/config.ts +0 -186
  23. package/src/auth/tokens.test.ts +0 -58
  24. package/src/auth/tokens.ts +0 -229
  25. package/src/chunking/AGENTS.md +0 -11
  26. package/src/chunking/ArtifactBatcher.test.ts +0 -22
  27. package/src/chunking/ArtifactBatcher.ts +0 -110
  28. package/src/chunking/ArtifactSplitter.test.ts +0 -38
  29. package/src/chunking/ArtifactSplitter.ts +0 -151
  30. package/src/debug/AGENTS.md +0 -79
  31. package/src/debug/logger.test.ts +0 -244
  32. package/src/debug/logger.ts +0 -211
  33. package/src/extract.test.ts +0 -22
  34. package/src/extract.ts +0 -150
  35. package/src/fields.test.ts +0 -681
  36. package/src/fields.ts +0 -246
  37. package/src/index.test.ts +0 -20
  38. package/src/index.ts +0 -110
  39. package/src/llm/AGENTS.md +0 -9
  40. package/src/llm/LLMClient.test.ts +0 -394
  41. package/src/llm/LLMClient.ts +0 -264
  42. package/src/llm/RetryingRunner.test.ts +0 -174
  43. package/src/llm/RetryingRunner.ts +0 -270
  44. package/src/llm/message.test.ts +0 -42
  45. package/src/llm/message.ts +0 -47
  46. package/src/llm/models.test.ts +0 -82
  47. package/src/llm/models.ts +0 -190
  48. package/src/llm/resolveModel.ts +0 -86
  49. package/src/merge/AGENTS.md +0 -6
  50. package/src/merge/Deduplicator.test.ts +0 -108
  51. package/src/merge/Deduplicator.ts +0 -45
  52. package/src/merge/SmartDataMerger.test.ts +0 -177
  53. package/src/merge/SmartDataMerger.ts +0 -56
  54. package/src/parsers/AGENTS.md +0 -58
  55. package/src/parsers/collect.test.ts +0 -56
  56. package/src/parsers/collect.ts +0 -31
  57. package/src/parsers/index.ts +0 -6
  58. package/src/parsers/mime.test.ts +0 -91
  59. package/src/parsers/mime.ts +0 -137
  60. package/src/parsers/npm.ts +0 -26
  61. package/src/parsers/pdf.test.ts +0 -394
  62. package/src/parsers/pdf.ts +0 -194
  63. package/src/parsers/runner.test.ts +0 -95
  64. package/src/parsers/runner.ts +0 -177
  65. package/src/parsers/types.ts +0 -29
  66. package/src/prompts/AGENTS.md +0 -8
  67. package/src/prompts/DeduplicationPrompt.test.ts +0 -41
  68. package/src/prompts/DeduplicationPrompt.ts +0 -37
  69. package/src/prompts/ExtractorPrompt.test.ts +0 -21
  70. package/src/prompts/ExtractorPrompt.ts +0 -72
  71. package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
  72. package/src/prompts/ParallelMergerPrompt.ts +0 -37
  73. package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
  74. package/src/prompts/SequentialExtractorPrompt.ts +0 -82
  75. package/src/prompts/formatArtifacts.test.ts +0 -39
  76. package/src/prompts/formatArtifacts.ts +0 -46
  77. package/src/strategies/AGENTS.md +0 -6
  78. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
  79. package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
  80. package/src/strategies/DoublePassStrategy.test.ts +0 -48
  81. package/src/strategies/DoublePassStrategy.ts +0 -266
  82. package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
  83. package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
  84. package/src/strategies/ParallelStrategy.test.ts +0 -61
  85. package/src/strategies/ParallelStrategy.ts +0 -208
  86. package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
  87. package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
  88. package/src/strategies/SequentialStrategy.test.ts +0 -53
  89. package/src/strategies/SequentialStrategy.ts +0 -142
  90. package/src/strategies/SimpleStrategy.test.ts +0 -46
  91. package/src/strategies/SimpleStrategy.ts +0 -94
  92. package/src/strategies/concurrency.test.ts +0 -16
  93. package/src/strategies/concurrency.ts +0 -14
  94. package/src/strategies/index.test.ts +0 -20
  95. package/src/strategies/index.ts +0 -7
  96. package/src/strategies/utils.test.ts +0 -76
  97. package/src/strategies/utils.ts +0 -95
  98. package/src/tokenization.test.ts +0 -119
  99. package/src/tokenization.ts +0 -71
  100. package/src/types.test.ts +0 -25
  101. package/src/types.ts +0 -174
  102. package/src/validation/AGENTS.md +0 -7
  103. package/src/validation/validator.test.ts +0 -204
  104. package/src/validation/validator.ts +0 -90
  105. package/tsconfig.json +0 -22
@@ -1,394 +0,0 @@
1
- import { test, expect, mock } from "bun:test";
2
-
3
- // ---------------------------------------------------------------------------
4
- // Stub for pdf-parse: we control what getText() and getImage() return so we
5
- // can test parsePdf without a real PDF file.
6
- // ---------------------------------------------------------------------------
7
-
8
// Shapes of the values the fake parser below hands back.
type TextPage = { num: number; text: string };
type EmbeddedImageStub = { dataUrl: string; width: number; height: number };
type PageImagesStub = { pageNumber: number; images: EmbeddedImageStub[] };
type ScreenshotPageStub = { pageNumber: number; dataUrl: string; width: number; height: number };

// Mutable knobs read by the fake parser at call time. Each test assigns these
// before invoking parsePdf.
let stubTextPages: TextPage[] = [];
let stubTextFull = "";
let stubImagePages: PageImagesStub[] = [];
let stubScreenshotPages: ScreenshotPageStub[] = [];
let stubGetImageThrows = false;
let stubGetScreenshotThrows = false;

/**
 * Stand-in for the PDFParse class exported by "pdf-parse".
 * Every method ignores its arguments and answers from the stub variables above.
 */
class FakePDFParse {
  constructor(_opts: unknown) {}

  /** Returns the configured pages and full text; `total` is never reported as 0. */
  async getText() {
    const pages = stubTextPages;
    return { pages, text: stubTextFull, total: pages.length || 1 };
  }

  /** Returns the configured embedded images, or throws when the failure flag is set. */
  async getImage(_params?: unknown) {
    if (stubGetImageThrows) {
      throw new Error("image extraction failed");
    }
    const pages = stubImagePages;
    return { pages, total: pages.length };
  }

  /** Returns the configured screenshots, or throws when the failure flag is set. */
  async getScreenshot(_params?: unknown) {
    if (stubGetScreenshotThrows) {
      throw new Error("screenshot rendering failed");
    }
    const pages = stubScreenshotPages;
    return { pages, total: pages.length };
  }

  /** Always reports the same document info. */
  async getInfo() {
    return { Title: "Test PDF" };
  }

  /** Nothing to release in the fake. */
  async destroy() {}
}

mock.module("pdf-parse", () => ({ PDFParse: FakePDFParse }));
51
-
52
- // Import after mock is registered
53
- const { parsePdf } = await import("./pdf");
54
-
55
- // ---------------------------------------------------------------------------
56
- // Helpers
57
- // ---------------------------------------------------------------------------
58
-
59
/** Builds a small placeholder buffer that only resembles the start of a PDF file. */
function makeBuffer() {
  const fakePdfHeader = "%PDF-1.4 fake";
  return Buffer.from(fakePdfHeader);
}
62
-
63
- // ---------------------------------------------------------------------------
64
- // Tests
65
- // ---------------------------------------------------------------------------
66
-
67
/** Values a test wants the fake parser to return; anything omitted is reset to its empty default. */
type StubConfig = {
  textPages?: TextPage[];
  textFull?: string;
  imagePages?: PageImagesStub[];
  screenshotPages?: ScreenshotPageStub[];
  getImageThrows?: boolean;
  getScreenshotThrows?: boolean;
};

/**
 * Assigns every stub variable in one go so that no state leaks from a
 * previous test. Omitted fields fall back to empty / false.
 */
function configureStubs(config: StubConfig = {}): void {
  stubTextPages = config.textPages ?? [];
  stubTextFull = config.textFull ?? "";
  stubImagePages = config.imagePages ?? [];
  stubScreenshotPages = config.screenshotPages ?? [];
  stubGetImageThrows = config.getImageThrows ?? false;
  stubGetScreenshotThrows = config.getScreenshotThrows ?? false;
}

test("parsePdf extracts per-page text when pages are present", async () => {
  configureStubs({
    textPages: [
      { num: 1, text: "Hello page one" },
      { num: 2, text: "Hello page two" },
    ],
    textFull: "Hello page one\nHello page two",
  });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.type).toBe("pdf");
  expect(artifact.contents).toHaveLength(2);
  expect(artifact.contents[0]).toEqual({ page: 1, text: "Hello page one" });
  expect(artifact.contents[1]).toEqual({ page: 2, text: "Hello page two" });
});

test("parsePdf falls back to full text when no pages are returned", async () => {
  configureStubs({ textFull: "entire document text" });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.contents).toHaveLength(1);
  expect(artifact.contents[0]?.text).toBe("entire document text");
  expect(artifact.contents[0]?.page).toBeUndefined();
});

test("parsePdf attaches images to the matching page content entry", async () => {
  configureStubs({
    textPages: [
      { num: 1, text: "Page with image" },
      { num: 2, text: "Page without image" },
    ],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,abc123", width: 100, height: 50 }],
      },
    ],
  });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.contents).toHaveLength(2);

  const page1 = artifact.contents[0]!;
  expect(page1.page).toBe(1);
  expect(page1.text).toBe("Page with image");
  expect(page1.media).toHaveLength(1);
  expect(page1.media![0]).toMatchObject({
    type: "image",
    base64: "abc123",
    width: 100,
    height: 50,
  });

  const page2 = artifact.contents[1]!;
  expect(page2.page).toBe(2);
  expect(page2.media).toBeUndefined();
});

test("parsePdf strips data URL prefix to produce raw base64", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "text" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/jpeg;base64,/9j/4AAQ==", width: 200, height: 200 }],
      },
    ],
  });

  const artifact = await parsePdf(makeBuffer());

  const img = artifact.contents[0]?.media?.[0];
  expect(img?.base64).toBe("/9j/4AAQ==");
});

test("parsePdf creates a content entry for pages that have only images (no text)", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "text only page" }],
    imagePages: [
      {
        pageNumber: 2,
        images: [{ dataUrl: "data:image/png;base64,img2", width: 80, height: 80 }],
      },
    ],
  });

  const artifact = await parsePdf(makeBuffer());

  // Expect one entry for page 1 (text) and one for page 2 (image only).
  expect(artifact.contents).toHaveLength(2);

  const imagePage = artifact.contents.find((c) => c.page === 2);
  expect(imagePage).toBeDefined();
  expect(imagePage?.text).toBeUndefined();
  expect(imagePage?.media).toHaveLength(1);
  expect(imagePage?.media![0]?.base64).toBe("img2");
});

test("parsePdf continues without images when getImage() throws", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "resilient page" }],
    getImageThrows: true,
  });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.contents).toHaveLength(1);
  expect(artifact.contents[0]?.text).toBe("resilient page");
  expect(artifact.contents[0]?.media).toBeUndefined();
});

test("parsePdf produces at least one content entry for empty documents", async () => {
  configureStubs();

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.contents).toHaveLength(1);
  expect(artifact.contents[0]?.text).toBe("");
});

test("parsePdf includes numpages in metadata", async () => {
  configureStubs({ textPages: [{ num: 1, text: "one" }] });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.metadata?.numpages).toBe(1);
  expect((artifact.metadata?.info as Record<string, unknown>)?.Title).toBe("Test PDF");
});

test("parsePdf raw() returns the original buffer", async () => {
  configureStubs({ textPages: [{ num: 1, text: "raw test" }] });

  const buf = makeBuffer();
  const artifact = await parsePdf(buf);
  const raw = await artifact.raw();

  // Identity check: the very same Buffer instance must come back.
  expect(raw).toBe(buf);
});

test("parsePdf with includeImages: false skips image extraction", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "text only" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,abc123", width: 100, height: 50 }],
      },
    ],
  });

  // The stub has image data available, so an absent media field shows that
  // includeImages: false kept getImage() from contributing anything.
  const artifact = await parsePdf(makeBuffer(), { includeImages: false });

  expect(artifact.contents).toHaveLength(1);
  expect(artifact.contents[0]?.text).toBe("text only");
  expect(artifact.contents[0]?.media).toBeUndefined();
});

test("parsePdf with includeImages: true behaves the same as the default", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "with images" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,xyz456", width: 80, height: 80 }],
      },
    ],
  });

  const artifact = await parsePdf(makeBuffer(), { includeImages: true });

  expect(artifact.contents[0]?.media).toHaveLength(1);
  expect(artifact.contents[0]?.media![0]?.base64).toBe("xyz456");
});

test("parsePdf with no options still includes images by default", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "default" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,def789", width: 60, height: 60 }],
      },
    ],
  });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.contents[0]?.media).toHaveLength(1);
  expect(artifact.contents[0]?.media![0]?.base64).toBe("def789");
});

test("parsePdf marks embedded images with imageType: 'embedded'", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page with embedded image" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,embedded123", width: 100, height: 100 }],
      },
    ],
  });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.contents[0]?.media).toHaveLength(1);
  expect(artifact.contents[0]?.media![0]?.imageType).toBe("embedded");
});

test("parsePdf marks screenshots with imageType: 'screenshot'", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page with screenshot" }],
    screenshotPages: [
      { pageNumber: 1, dataUrl: "data:image/png;base64,screenshot456", width: 800, height: 600 },
    ],
  });

  const artifact = await parsePdf(makeBuffer(), { screenshots: true });

  expect(artifact.contents[0]?.media).toHaveLength(1);
  expect(artifact.contents[0]?.media![0]?.imageType).toBe("screenshot");
});

test("parsePdf correctly differentiates embedded images and screenshots on the same page", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page with both" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,embedded789", width: 100, height: 100 }],
      },
    ],
    screenshotPages: [
      { pageNumber: 1, dataUrl: "data:image/png;base64,screenshot012", width: 800, height: 600 },
    ],
  });

  const artifact = await parsePdf(makeBuffer(), { screenshots: true });

  expect(artifact.contents[0]?.media).toHaveLength(2);

  const embeddedImage = artifact.contents[0]?.media?.find((img) => img.imageType === "embedded");
  const screenshotImage = artifact.contents[0]?.media?.find((img) => img.imageType === "screenshot");

  expect(embeddedImage).toBeDefined();
  expect(embeddedImage?.base64).toBe("embedded789");
  expect(screenshotImage).toBeDefined();
  expect(screenshotImage?.base64).toBe("screenshot012");
});

test("parsePdf without screenshots option does not include screenshot images", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "page" }],
    imagePages: [
      {
        pageNumber: 1,
        images: [{ dataUrl: "data:image/png;base64,embedded345", width: 100, height: 100 }],
      },
    ],
    screenshotPages: [
      { pageNumber: 1, dataUrl: "data:image/png;base64,screenshot678", width: 800, height: 600 },
    ],
  });

  const artifact = await parsePdf(makeBuffer());

  expect(artifact.contents[0]?.media).toHaveLength(1);
  expect(artifact.contents[0]?.media![0]?.imageType).toBe("embedded");
});

test("parsePdf continues without screenshots when getScreenshot() throws", async () => {
  configureStubs({
    textPages: [{ num: 1, text: "resilient page" }],
    getScreenshotThrows: true,
  });

  const artifact = await parsePdf(makeBuffer(), { screenshots: true });

  expect(artifact.contents).toHaveLength(1);
  expect(artifact.contents[0]?.text).toBe("resilient page");
  expect(artifact.contents[0]?.media).toBeUndefined();
});
@@ -1,194 +0,0 @@
1
- import type { Artifact, ArtifactContent, ArtifactImage } from "../types";
2
- import { collectStream } from "./collect";
3
-
4
/** Optional switches accepted by parsePdf. Every field may be omitted. */
export type ParsePdfOptions = {
  /**
   * Controls extraction of images embedded in the PDF. Images are extracted
   * unless this is explicitly `false`; extracted images are attached to the
   * `media` field of the matching page as base64 ArtifactImage entries.
   */
  includeImages?: boolean;
  /**
   * Set to `true` to additionally render each page as a screenshot and append
   * it to that page's `media`. Any other value (including omission) leaves
   * screenshots off.
   */
  screenshots?: boolean;
  /**
   * Scale factor handed to the screenshot renderer when `screenshotWidth` is
   * not given. Falls back to 1.5 when omitted.
   */
  screenshotScale?: number;
  /**
   * Desired screenshot width in pixels. When provided it is used instead of
   * `screenshotScale`. The height is left to the renderer (presumably chosen
   * to keep the page's aspect ratio — confirm against pdf-parse).
   */
  screenshotWidth?: number;
};
28
-
29
- /**
30
- * Built-in PDF parser using pdf-parse.
31
- *
32
- * Accepts a Buffer or ReadableStream<Uint8Array> and extracts text per-page
33
- * into ArtifactContent[] with page numbers set. Embedded images on each page
34
- * are extracted and included as base64-encoded ArtifactImage entries in the
35
- * media field of the corresponding content block (unless includeImages is
36
- * false). Returns an Artifact with type: "pdf".
37
- */
38
- export async function parsePdf(
39
- input: Buffer | ReadableStream<Uint8Array>,
40
- options?: ParsePdfOptions,
41
- ): Promise<Artifact> {
42
- const buffer = Buffer.isBuffer(input) ? input : await collectStream(input);
43
-
44
- // Dynamic import to avoid bundling issues
45
- const { PDFParse } = await import("pdf-parse");
46
-
47
- const parser = new PDFParse({ data: buffer });
48
- const textResult = await parser.getText();
49
-
50
- // Build a page-number → text map from per-page results
51
- const pageTextMap = new Map<number, string>();
52
- if (textResult.pages.length > 0) {
53
- for (const page of textResult.pages) {
54
- if (page.text && page.text.trim().length > 0) {
55
- pageTextMap.set(page.num, page.text);
56
- }
57
- }
58
- }
59
-
60
- // Extract embedded images unless the caller opted out.
61
- // imageBuffer=false saves memory (we only need the data URL).
62
- let imageResult;
63
- if (options?.includeImages !== false) {
64
- try {
65
- imageResult = await parser.getImage({ imageBuffer: false, imageDataUrl: true });
66
- } catch {
67
- // Image extraction is optional — continue without images if it fails
68
- }
69
- }
70
-
71
- // Render page screenshots if requested
72
- let screenshotResult;
73
- if (options?.screenshots === true) {
74
- try {
75
- const screenshotParams: {
76
- imageBuffer: boolean;
77
- imageDataUrl: boolean;
78
- scale?: number;
79
- desiredWidth?: number;
80
- } = { imageBuffer: false, imageDataUrl: true };
81
-
82
- if (options.screenshotWidth !== undefined) {
83
- screenshotParams.desiredWidth = options.screenshotWidth;
84
- } else {
85
- screenshotParams.scale = options.screenshotScale ?? 1.5;
86
- }
87
-
88
- screenshotResult = await parser.getScreenshot(screenshotParams);
89
- } catch {
90
- // Screenshot rendering is optional — continue without screenshots if it fails
91
- }
92
- }
93
-
94
- // Build a page-number → ArtifactImage[] map from extracted images
95
- const pageImageMap = new Map<number, ArtifactImage[]>();
96
- if (imageResult) {
97
- for (const pageImages of imageResult.pages) {
98
- const artifactImages: ArtifactImage[] = pageImages.images
99
- .filter((img) => img.dataUrl)
100
- .map((img) => {
101
- // Strip the "data:<mime>;base64," prefix to get the raw base64 string
102
- const base64 = img.dataUrl.replace(/^data:[^;]+;base64,/, "");
103
- const artifactImage: ArtifactImage = {
104
- type: "image",
105
- base64,
106
- width: img.width,
107
- height: img.height,
108
- imageType: "embedded",
109
- };
110
- return artifactImage;
111
- });
112
- if (artifactImages.length > 0) {
113
- pageImageMap.set(pageImages.pageNumber, artifactImages);
114
- }
115
- }
116
- }
117
-
118
- // Add screenshots to the pageImageMap
119
- if (screenshotResult) {
120
- for (const screenshot of screenshotResult.pages) {
121
- if (screenshot.dataUrl) {
122
- // Strip the "data:<mime>;base64," prefix to get the raw base64 string
123
- const base64 = screenshot.dataUrl.replace(/^data:[^;]+;base64,/, "");
124
- const artifactImage: ArtifactImage = {
125
- type: "image",
126
- base64,
127
- width: screenshot.width,
128
- height: screenshot.height,
129
- imageType: "screenshot",
130
- };
131
- // Append to existing images for this page, or create new entry
132
- const existing = pageImageMap.get(screenshot.pageNumber) ?? [];
133
- pageImageMap.set(screenshot.pageNumber, [...existing, artifactImage]);
134
- }
135
- }
136
- }
137
-
138
- let contents: ArtifactContent[];
139
-
140
- if (textResult.pages.length > 0) {
141
- // Collect all page numbers that have text or images
142
- const allPageNums = new Set<number>([
143
- ...pageTextMap.keys(),
144
- ...pageImageMap.keys(),
145
- ]);
146
-
147
- contents = Array.from(allPageNums)
148
- .sort((a, b) => a - b)
149
- .map((pageNum) => {
150
- const entry: ArtifactContent = { page: pageNum };
151
- const text = pageTextMap.get(pageNum);
152
- if (text) entry.text = text;
153
- const media = pageImageMap.get(pageNum);
154
- if (media) entry.media = media;
155
- return entry;
156
- });
157
- } else {
158
- // Fallback: no per-page info — use full concatenated text
159
- const entry: ArtifactContent = { text: textResult.text };
160
- // Attach any images from the first page if available
161
- const firstPageImages = pageImageMap.size > 0
162
- ? pageImageMap.values().next().value
163
- : undefined;
164
- if (firstPageImages) entry.media = firstPageImages;
165
- contents = [entry];
166
- }
167
-
168
- // Ensure we have at least one content entry
169
- if (contents.length === 0) {
170
- contents = [{ text: "" }];
171
- }
172
-
173
- let infoResult;
174
- try {
175
- infoResult = await parser.getInfo();
176
- } catch {
177
- // Info extraction is optional
178
- }
179
-
180
- await parser.destroy();
181
-
182
- return {
183
- id: `artifact-${crypto.randomUUID()}`,
184
- type: "pdf",
185
- raw: async () => buffer,
186
- contents,
187
- metadata: infoResult
188
- ? {
189
- numpages: textResult.total,
190
- info: infoResult,
191
- }
192
- : { numpages: textResult.total },
193
- };
194
- }
@@ -1,95 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { runParser } from "./runner";
3
- import type { ParserDef, ParserInput } from "./types";
4
- import path from "node:path";
5
- import os from "node:os";
6
- import { rm, writeFile } from "node:fs/promises";
7
-
8
/** Serialises a one-element artifact list; the command-based tests echo this string back. */
const makeArtifactJson = () =>
  JSON.stringify([
    {
      id: "test-1",
      type: "text",
      contents: [{ text: "parsed text" }],
    },
  ]);
17
-
18
test("command-stdin runner pipes buffer to stdin and parses output", async () => {
  // The command never reads stdin; it just echoes a valid artifact JSON payload.
  const payload = makeArtifactJson();
  const parserDef: ParserDef = {
    type: "command-stdin",
    command: `echo ${JSON.stringify(payload)}`,
  };
  const parserInput: ParserInput = {
    kind: "buffer",
    buffer: Buffer.from("irrelevant input"),
  };

  const parsed = await runParser(parserDef, parserInput, "text/plain");

  expect(parsed).toHaveLength(1);
  const first = parsed[0];
  expect(first?.id).toBe("test-1");
  expect(first?.contents[0]?.text).toBe("parsed text");
});
35
-
36
/**
 * Writes an executable shell script to the OS temp directory that echoes the
 * artifact JSON, hands its path to `run`, and removes the script afterwards
 * regardless of the outcome.
 */
async function withEchoScript(run: (scriptPath: string) => Promise<void>): Promise<void> {
  const artifactJson = makeArtifactJson();
  const scriptPath = path.join(os.tmpdir(), `struktur-test-${Math.random().toString(16).slice(2)}.sh`);
  await writeFile(scriptPath, `#!/bin/sh\necho '${artifactJson}'`, { mode: 0o755 });

  try {
    await run(scriptPath);
  } finally {
    await rm(scriptPath, { force: true });
  }
}

test("command-file runner interpolates FILE_PATH and parses output", async () => {
  await withEchoScript(async (scriptPath) => {
    const parserDef: ParserDef = {
      type: "command-file",
      command: `${scriptPath} FILE_PATH`,
    };
    // The script doubles as the input file; its content is irrelevant here.
    const parserInput: ParserInput = {
      kind: "file",
      path: scriptPath,
    };

    const parsed = await runParser(parserDef, parserInput, "text/plain");

    expect(parsed).toHaveLength(1);
    expect(parsed[0]?.id).toBe("test-1");
  });
});

test("command-file runner writes temp file for buffer input", async () => {
  await withEchoScript(async (scriptPath) => {
    const parserDef: ParserDef = {
      type: "command-file",
      command: `${scriptPath} FILE_PATH`,
    };
    const parserInput: ParserInput = {
      kind: "buffer",
      buffer: Buffer.from("some buffer data"),
    };

    const parsed = await runParser(parserDef, parserInput, "text/plain");

    expect(parsed).toHaveLength(1);
    expect(parsed[0]?.id).toBe("test-1");
  });
});
82
-
83
// The previous title claimed to cover a package that "exports neither function",
// but the package named here is expected not to exist, so the import itself is
// what should fail. The title now says what is actually exercised.
test("npm runner rejects when the package cannot be imported", async () => {
  const def: ParserDef = {
    type: "npm",
    package: "this-package-does-not-exist-xyz-abc-123",
  };
  const input: ParserInput = {
    kind: "buffer",
    buffer: Buffer.from("data"),
  };

  // Only the rejection is asserted, not the error's type or message.
  await expect(runParser(def, input, "text/plain")).rejects.toThrow();
});