@struktur/sdk 2.1.2 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. package/dist/artifacts/fileToArtifact.d.ts +8 -0
  2. package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
  3. package/dist/artifacts/input.d.ts +60 -0
  4. package/dist/artifacts/input.d.ts.map +1 -0
  5. package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
  6. package/dist/artifacts/providers.d.ts.map +1 -0
  7. package/dist/artifacts/urlToArtifact.d.ts +3 -0
  8. package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
  9. package/dist/auth/config.d.ts +34 -0
  10. package/dist/auth/config.d.ts.map +1 -0
  11. package/dist/auth/tokens.d.ts +18 -0
  12. package/dist/auth/tokens.d.ts.map +1 -0
  13. package/dist/chunking/ArtifactBatcher.d.ts +11 -0
  14. package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
  15. package/dist/chunking/ArtifactSplitter.d.ts +10 -0
  16. package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
  17. package/dist/debug/logger.d.ts +169 -0
  18. package/dist/debug/logger.d.ts.map +1 -0
  19. package/dist/extract.d.ts +3 -0
  20. package/dist/extract.d.ts.map +1 -0
  21. package/dist/fields.d.ts +75 -0
  22. package/dist/fields.d.ts.map +1 -0
  23. package/dist/index.d.ts +24 -0
  24. package/dist/index.d.ts.map +1 -0
  25. package/dist/index.js +5603 -0
  26. package/dist/index.js.map +1 -0
  27. package/dist/llm/LLMClient.d.ts +40 -0
  28. package/dist/llm/LLMClient.d.ts.map +1 -0
  29. package/dist/llm/RetryingRunner.d.ts +37 -0
  30. package/dist/llm/RetryingRunner.d.ts.map +1 -0
  31. package/dist/llm/message.d.ts +12 -0
  32. package/dist/llm/message.d.ts.map +1 -0
  33. package/dist/llm/models.d.ts +13 -0
  34. package/dist/llm/models.d.ts.map +1 -0
  35. package/dist/llm/resolveModel.d.ts +3 -0
  36. package/dist/llm/resolveModel.d.ts.map +1 -0
  37. package/dist/merge/Deduplicator.d.ts +4 -0
  38. package/dist/merge/Deduplicator.d.ts.map +1 -0
  39. package/dist/merge/SmartDataMerger.d.ts +7 -0
  40. package/dist/merge/SmartDataMerger.d.ts.map +1 -0
  41. package/dist/parsers/collect.d.ts +7 -0
  42. package/dist/parsers/collect.d.ts.map +1 -0
  43. package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
  44. package/dist/parsers/index.d.ts.map +1 -0
  45. package/dist/parsers/mime.d.ts +12 -0
  46. package/dist/parsers/mime.d.ts.map +1 -0
  47. package/dist/parsers/npm.d.ts +16 -0
  48. package/dist/parsers/npm.d.ts.map +1 -0
  49. package/dist/parsers/pdf.d.ts +36 -0
  50. package/dist/parsers/pdf.d.ts.map +1 -0
  51. package/dist/parsers/runner.d.ts +4 -0
  52. package/dist/parsers/runner.d.ts.map +1 -0
  53. package/dist/parsers/types.d.ts +27 -0
  54. package/dist/parsers/types.d.ts.map +1 -0
  55. package/dist/parsers.d.ts +1 -0
  56. package/dist/parsers.js +492 -0
  57. package/dist/parsers.js.map +1 -0
  58. package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
  59. package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
  60. package/dist/prompts/ExtractorPrompt.d.ts +6 -0
  61. package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
  62. package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
  63. package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
  64. package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
  65. package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
  66. package/dist/prompts/formatArtifacts.d.ts +3 -0
  67. package/dist/prompts/formatArtifacts.d.ts.map +1 -0
  68. package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
  69. package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
  70. package/dist/strategies/DoublePassStrategy.d.ts +22 -0
  71. package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
  72. package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
  73. package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
  74. package/dist/strategies/ParallelStrategy.d.ts +22 -0
  75. package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
  76. package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
  77. package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
  78. package/dist/strategies/SequentialStrategy.d.ts +20 -0
  79. package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
  80. package/dist/strategies/SimpleStrategy.d.ts +18 -0
  81. package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
  82. package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
  83. package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
  84. package/dist/strategies/agent/AgentTools.d.ts +55 -0
  85. package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
  86. package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
  87. package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
  88. package/dist/strategies/agent/index.d.ts +4 -0
  89. package/dist/strategies/agent/index.d.ts.map +1 -0
  90. package/dist/strategies/concurrency.d.ts +2 -0
  91. package/dist/strategies/concurrency.d.ts.map +1 -0
  92. package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
  93. package/dist/strategies/index.d.ts.map +1 -0
  94. package/dist/strategies/utils.d.ts +39 -0
  95. package/dist/strategies/utils.d.ts.map +1 -0
  96. package/dist/strategies.d.ts +1 -0
  97. package/dist/strategies.js +3930 -0
  98. package/dist/strategies.js.map +1 -0
  99. package/dist/tokenization.d.ts +11 -0
  100. package/dist/tokenization.d.ts.map +1 -0
  101. package/dist/types.d.ts +178 -0
  102. package/dist/types.d.ts.map +1 -0
  103. package/dist/validation/validator.d.ts +20 -0
  104. package/dist/validation/validator.d.ts.map +1 -0
  105. package/package.json +30 -14
  106. package/src/agent-cli-integration.test.ts +0 -47
  107. package/src/agent-export.test.ts +0 -17
  108. package/src/agent-tool-labels.test.ts +0 -50
  109. package/src/artifacts/AGENTS.md +0 -16
  110. package/src/artifacts/fileToArtifact.test.ts +0 -37
  111. package/src/artifacts/fileToArtifact.ts +0 -44
  112. package/src/artifacts/input.test.ts +0 -243
  113. package/src/artifacts/input.ts +0 -360
  114. package/src/artifacts/providers.test.ts +0 -19
  115. package/src/artifacts/urlToArtifact.test.ts +0 -23
  116. package/src/artifacts/urlToArtifact.ts +0 -19
  117. package/src/auth/AGENTS.md +0 -11
  118. package/src/auth/config.test.ts +0 -132
  119. package/src/auth/config.ts +0 -186
  120. package/src/auth/tokens.test.ts +0 -58
  121. package/src/auth/tokens.ts +0 -229
  122. package/src/chunking/AGENTS.md +0 -11
  123. package/src/chunking/ArtifactBatcher.test.ts +0 -22
  124. package/src/chunking/ArtifactBatcher.ts +0 -110
  125. package/src/chunking/ArtifactSplitter.test.ts +0 -38
  126. package/src/chunking/ArtifactSplitter.ts +0 -151
  127. package/src/debug/AGENTS.md +0 -79
  128. package/src/debug/logger.test.ts +0 -244
  129. package/src/debug/logger.ts +0 -211
  130. package/src/extract.test.ts +0 -22
  131. package/src/extract.ts +0 -150
  132. package/src/fields.test.ts +0 -681
  133. package/src/fields.ts +0 -246
  134. package/src/index.test.ts +0 -20
  135. package/src/index.ts +0 -110
  136. package/src/llm/AGENTS.md +0 -9
  137. package/src/llm/LLMClient.test.ts +0 -394
  138. package/src/llm/LLMClient.ts +0 -264
  139. package/src/llm/RetryingRunner.test.ts +0 -174
  140. package/src/llm/RetryingRunner.ts +0 -270
  141. package/src/llm/message.test.ts +0 -42
  142. package/src/llm/message.ts +0 -47
  143. package/src/llm/models.test.ts +0 -82
  144. package/src/llm/models.ts +0 -190
  145. package/src/llm/resolveModel.ts +0 -86
  146. package/src/merge/AGENTS.md +0 -6
  147. package/src/merge/Deduplicator.test.ts +0 -108
  148. package/src/merge/Deduplicator.ts +0 -45
  149. package/src/merge/SmartDataMerger.test.ts +0 -177
  150. package/src/merge/SmartDataMerger.ts +0 -56
  151. package/src/parsers/AGENTS.md +0 -58
  152. package/src/parsers/collect.test.ts +0 -56
  153. package/src/parsers/collect.ts +0 -31
  154. package/src/parsers/mime.test.ts +0 -91
  155. package/src/parsers/mime.ts +0 -137
  156. package/src/parsers/npm.ts +0 -26
  157. package/src/parsers/pdf.test.ts +0 -394
  158. package/src/parsers/pdf.ts +0 -194
  159. package/src/parsers/runner.test.ts +0 -95
  160. package/src/parsers/runner.ts +0 -177
  161. package/src/parsers/types.ts +0 -29
  162. package/src/prompts/AGENTS.md +0 -8
  163. package/src/prompts/DeduplicationPrompt.test.ts +0 -41
  164. package/src/prompts/DeduplicationPrompt.ts +0 -37
  165. package/src/prompts/ExtractorPrompt.test.ts +0 -21
  166. package/src/prompts/ExtractorPrompt.ts +0 -72
  167. package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
  168. package/src/prompts/ParallelMergerPrompt.ts +0 -37
  169. package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
  170. package/src/prompts/SequentialExtractorPrompt.ts +0 -82
  171. package/src/prompts/formatArtifacts.test.ts +0 -39
  172. package/src/prompts/formatArtifacts.ts +0 -46
  173. package/src/strategies/AGENTS.md +0 -6
  174. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
  175. package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
  176. package/src/strategies/DoublePassStrategy.test.ts +0 -48
  177. package/src/strategies/DoublePassStrategy.ts +0 -266
  178. package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
  179. package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
  180. package/src/strategies/ParallelStrategy.test.ts +0 -61
  181. package/src/strategies/ParallelStrategy.ts +0 -208
  182. package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
  183. package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
  184. package/src/strategies/SequentialStrategy.test.ts +0 -53
  185. package/src/strategies/SequentialStrategy.ts +0 -142
  186. package/src/strategies/SimpleStrategy.test.ts +0 -46
  187. package/src/strategies/SimpleStrategy.ts +0 -94
  188. package/src/strategies/concurrency.test.ts +0 -16
  189. package/src/strategies/concurrency.ts +0 -14
  190. package/src/strategies/index.test.ts +0 -20
  191. package/src/strategies/utils.test.ts +0 -76
  192. package/src/strategies/utils.ts +0 -95
  193. package/src/tokenization.test.ts +0 -119
  194. package/src/tokenization.ts +0 -71
  195. package/src/types.test.ts +0 -25
  196. package/src/types.ts +0 -174
  197. package/src/validation/AGENTS.md +0 -7
  198. package/src/validation/validator.test.ts +0 -204
  199. package/src/validation/validator.ts +0 -90
  200. package/tsconfig.json +0 -22
@@ -1,177 +0,0 @@
1
- import os from "node:os";
2
- import path from "node:path";
3
- import { rm, writeFile } from "node:fs/promises";
4
- import type { Artifact } from "../types";
5
- import type { ParserDef, ParserInput } from "./types";
6
- import type { NpmParserModule } from "./npm";
7
- import {
8
- hydrateSerializedArtifacts,
9
- validateSerializedArtifacts,
10
- } from "../artifacts/input";
11
-
12
/**
 * Convert the stdout of a parser command into artifacts.
 *
 * The text is parsed as JSON, passed through `validateSerializedArtifacts`,
 * and the validated payload is handed to `hydrateSerializedArtifacts`.
 *
 * @param stdout - Raw text captured from the parser command.
 * @returns The artifacts returned by `hydrateSerializedArtifacts`.
 * @throws Error when `stdout` is not valid JSON; the message carries the parse
 *   failure and the first 200 characters of the output.
 */
const parseCommandOutput = (stdout: string): Artifact[] => {
  const decode = (): unknown => {
    try {
      return JSON.parse(stdout);
    } catch (cause) {
      const reason = cause instanceof Error ? cause.message : String(cause);
      throw new Error(`Parser command produced invalid JSON: ${reason}\nOutput: ${stdout.slice(0, 200)}`);
    }
  };

  return hydrateSerializedArtifacts(validateSerializedArtifacts(decode()));
};
23
-
24
/**
 * Run a shell command through `sh -c` and resolve with what it wrote to stdout.
 *
 * @param command - Shell command line; must contain non-whitespace characters.
 * @param stdinBuffer - Optional bytes written to the child's stdin, which is
 *   then closed. When omitted, stdin is set to "ignore".
 * @returns The captured stdout as text.
 * @throws Error if the command is blank, or if the process exits with a
 *   non-zero code (the message includes the command and up to 500 characters
 *   of stderr).
 */
const spawnAndCapture = async (command: string, stdinBuffer?: Buffer): Promise<string> => {
  if (command.trim().length === 0) {
    throw new Error(`Empty command: ${command}`);
  }

  const child = Bun.spawn(["sh", "-c", command], {
    stdout: "pipe",
    stderr: "pipe",
    stdin: stdinBuffer ? "pipe" : "ignore",
  });

  if (stdinBuffer && child.stdin) {
    // Bun exposes stdin as a FileSink (write/end), not as a WritableStream.
    const stdinSink = child.stdin as { write: (data: Uint8Array) => void; end: () => void };
    stdinSink.write(stdinBuffer);
    stdinSink.end();
  }

  // Read both pipes and wait for the exit code together.
  const [output, errorOutput, exitCode] = await Promise.all([
    new Response(child.stdout).text(),
    new Response(child.stderr).text(),
    child.exited,
  ]);

  if (exitCode === 0) {
    return output;
  }

  throw new Error(
    `Parser command exited with code ${exitCode}: ${command}\nStderr: ${errorOutput.slice(0, 500)}`
  );
};
56
-
57
/**
 * Load an npm parser package and run it against the given input.
 *
 * Dispatch rules:
 * - file input: `parseFile` is preferred; otherwise the file is opened as a
 *   stream and passed to `parseStream`.
 * - buffer input: `parseStream` is preferred; otherwise the buffer is written
 *   to a temporary file, passed to `parseFile`, and the file is removed.
 *
 * @param pkg - Module specifier passed to dynamic `import()`.
 * @param input - File path or in-memory buffer to parse.
 * @param mimeType - MIME type forwarded to the parser.
 * @returns The artifacts produced by the parser package.
 * @throws Error when the package exports neither `parseFile` nor `parseStream`.
 */
const runNpmParser = async (
  pkg: string,
  input: ParserInput,
  mimeType: string,
): Promise<Artifact[]> => {
  const parserModule = (await import(pkg)) as NpmParserModule;

  const canParseFile = typeof parserModule.parseFile === "function";
  const canParseStream = typeof parserModule.parseStream === "function";

  if (!(canParseFile || canParseStream)) {
    throw new Error(
      `npm parser package "${pkg}" exports neither parseFile nor parseStream`
    );
  }

  if (input.kind === "file") {
    if (canParseFile) {
      // The parser reads the file itself, so nothing is loaded here.
      return parserModule.parseFile!(input.path, mimeType);
    }
    const fileStream = Bun.file(input.path).stream() as ReadableStream<Uint8Array>;
    return parserModule.parseStream!(fileStream, mimeType);
  }

  // From here on the input is an in-memory buffer.
  const bytes = input.buffer;

  if (!canParseStream) {
    // Only parseFile is available: stage the bytes in a temp file and clean up afterwards.
    const scratchPath = path.join(os.tmpdir(), `struktur-parse-${crypto.randomUUID()}`);
    try {
      await writeFile(scratchPath, bytes);
      return await parserModule.parseFile!(scratchPath, mimeType);
    } finally {
      await rm(scratchPath, { force: true });
    }
  }

  const bufferStream = new ReadableStream<Uint8Array>({
    start(controller) {
      controller.enqueue(bytes);
      controller.close();
    },
  });
  return parserModule.parseStream!(bufferStream, mimeType);
};
105
-
106
/**
 * Run a "command-file" parser: substitute the input's path for every
 * `FILE_PATH` placeholder in `command`, execute it, and parse its stdout.
 *
 * Buffer inputs are first written to a temporary file, which is removed when
 * the command finishes or fails.
 *
 * @param command - Shell command template containing `FILE_PATH` placeholders.
 * @param input - File path or in-memory buffer to hand to the command.
 * @returns The artifacts parsed from the command's stdout.
 * @throws Error when the command fails or its output is not valid artifact JSON
 *   (propagated from `spawnAndCapture` / `parseCommandOutput`).
 */
const runCommandFileParser = async (
  command: string,
  input: ParserInput,
): Promise<Artifact[]> => {
  let tempFile: string | null = null;

  try {
    let filePath: string;
    if (input.kind === "file") {
      filePath = input.path;
    } else {
      // Stage the buffer on disk. The write happens inside the try block so a
      // partially written temp file is still removed if writeFile rejects.
      tempFile = path.join(os.tmpdir(), `struktur-parse-${crypto.randomUUID()}`);
      await writeFile(tempFile, input.buffer);
      filePath = tempFile;
    }

    // Use a replacer function: with a string replacement, sequences such as
    // "$&", "$1" or "$$" inside the path would be interpreted as replacement
    // patterns and corrupt the substituted path.
    const targetPath = filePath;
    // NOTE(review): the path is inserted verbatim and is not shell-quoted, so
    // command templates must quote FILE_PATH themselves if paths may contain
    // spaces or shell metacharacters.
    const interpolated = command.replace(/FILE_PATH/g, () => targetPath);
    const stdout = await spawnAndCapture(interpolated);
    return parseCommandOutput(stdout);
  } finally {
    if (tempFile !== null) {
      await rm(tempFile, { force: true });
    }
  }
};
132
-
133
/**
 * Run a "command-stdin" parser: feed the input bytes to the command on stdin
 * and parse its stdout.
 *
 * @param command - Shell command to execute.
 * @param input - File path (read fully into memory) or in-memory buffer.
 * @returns The artifacts parsed from the command's stdout.
 */
const runCommandStdinParser = async (
  command: string,
  input: ParserInput,
): Promise<Artifact[]> => {
  const payload: Buffer =
    input.kind === "file"
      ? Buffer.from(await Bun.file(input.path).arrayBuffer())
      : input.buffer;

  return parseCommandOutput(await spawnAndCapture(command, payload));
};
149
-
150
/**
 * Execute a parser definition against an input and return its artifacts.
 *
 * - "npm": delegates to `runNpmParser`.
 * - "command-file" / "command-stdin": delegate to the matching command runner.
 * - "inline": loads the input into a buffer and wraps the handler's single
 *   artifact in an array.
 *
 * @param def - Parser definition selecting how the input is parsed.
 * @param input - File path or in-memory buffer to parse.
 * @param mimeType - MIME type; only forwarded to npm parsers.
 * @returns The artifacts produced by the selected parser.
 * @throws Error when `def.type` is not one of the known parser types.
 */
export const runParser = async (
  def: ParserDef,
  input: ParserInput,
  mimeType: string,
): Promise<Artifact[]> => {
  if (def.type === "npm") {
    return runNpmParser(def.package, input, mimeType);
  }

  if (def.type === "command-file") {
    return runCommandFileParser(def.command, input);
  }

  if (def.type === "command-stdin") {
    return runCommandStdinParser(def.command, input);
  }

  if (def.type === "inline") {
    const bytes: Buffer =
      input.kind === "file"
        ? Buffer.from(await Bun.file(input.path).arrayBuffer())
        : input.buffer;
    const artifact = await def.handler(bytes);
    return [artifact];
  }

  // Compile-time exhaustiveness check; reached at runtime only for untyped callers.
  const unreachable: never = def;
  throw new Error(`Unknown parser type: ${(unreachable as { type: string }).type}`);
};
@@ -1,29 +0,0 @@
1
- import type { Artifact } from "../types";
2
-
3
/** Parser implemented by an npm package. */
export type NpmParserDef = {
  type: "npm";
  /** Package specifier, e.g. "@myorg/pdf-parser". */
  package: string;
};

/** Parser implemented by a shell command that receives a file path. */
export type CommandFileDef = {
  type: "command-file";
  /** Command template; must contain the FILE_PATH placeholder. */
  command: string;
};

/** Parser implemented by a shell command that receives the input on stdin. */
export type CommandStdinDef = {
  type: "command-stdin";
  command: string;
};

/** Parser implemented by an in-process handler that returns one artifact. */
export type InlineParserDef = {
  type: "inline";
  handler: (buffer: Buffer) => Promise<Artifact>;
};

/** Any supported parser definition, discriminated by `type`. */
export type ParserDef =
  | NpmParserDef
  | CommandFileDef
  | CommandStdinDef
  | InlineParserDef;

/** Parser definitions keyed by MIME type. */
export type ParsersConfig = Record<string, ParserDef>;

/** Input handed to a parser: a path on disk or bytes in memory. */
export type ParserInput =
  | { kind: "file"; path: string }
  | { kind: "buffer"; buffer: Buffer };
@@ -1,8 +0,0 @@
1
- Prompts module
2
-
3
- - Purpose: generate optimized extraction/merge/dedupe prompt texts with thinking guidance and format artifacts as XML.
4
- - Design: prompt builders return `{ system, user }` strings with concise instructions and `<thinking>` sections; artifacts render into XML blocks with image refs.
5
- - Changes: system prompts now include structured `<thinking>` guidance and `<rules>` sections, and `outputInstructions` now appears above the schema for better context flow.
6
- - Key files: `ExtractorPrompt.ts`, `SequentialExtractorPrompt.ts`, `ParallelMergerPrompt.ts`, `DeduplicationPrompt.ts`, `formatArtifacts.ts`.
7
- - Design: prompt builders return `{ system, user }` strings; artifacts render into XML blocks with image refs.
8
- - Tests: `ExtractorPrompt.test.ts`, `SequentialExtractorPrompt.test.ts`, `ParallelMergerPrompt.test.ts`, `DeduplicationPrompt.test.ts`.
@@ -1,41 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { buildDeduplicationPrompt } from "./DeduplicationPrompt";
3
-
4
// Each test builds a fresh prompt and checks one aspect of the generated text.

test("buildDeduplicationPrompt includes keys format instructions", () => {
  const prompt = buildDeduplicationPrompt("{}", { items: [] });

  expect(prompt.system).toContain("keys");
  expect(prompt.user).toContain("duplicate");
});

test("buildDeduplicationPrompt embeds schema", () => {
  const schemaText = '{"type":"object","properties":{"items":{"type":"array"}}}';

  const prompt = buildDeduplicationPrompt(schemaText, { items: [] });

  expect(prompt.user).toContain(schemaText);
});

test("buildDeduplicationPrompt embeds data", () => {
  const payload = { items: [{ id: 1 }, { id: 1 }] };

  const prompt = buildDeduplicationPrompt("{}", payload);

  expect(prompt.user).toContain(JSON.stringify(payload));
});

test("buildDeduplicationPrompt uses default example keys in example", () => {
  const prompt = buildDeduplicationPrompt("{}", { items: [] });

  for (const expectedKey of ["items.3", "items.5"]) {
    expect(prompt.user).toContain(expectedKey);
  }
});

test("buildDeduplicationPrompt includes thinking section", () => {
  const prompt = buildDeduplicationPrompt("{}", { items: [] });

  expect(prompt.system).toContain("<thinking>");
});

test("buildDeduplicationPrompt includes rules section", () => {
  const prompt = buildDeduplicationPrompt("{}", { items: [] });

  expect(prompt.system).toContain("<rules>");
});

test("buildDeduplicationPrompt includes task", () => {
  const prompt = buildDeduplicationPrompt("{}", { items: [] });

  expect(prompt.user).toContain("<task>");
});
@@ -1,37 +0,0 @@
1
/**
 * Describe a set of example paths in prose for the `<example>` sentence.
 *
 * When every key has the form `<collection>.<index>` with the same collection
 * (e.g. "items.3", "items.5"), the result reads "items at indices 3 and 5";
 * otherwise the quoted paths are listed verbatim.
 */
const describeDuplicateExample = (keys: readonly string[]): string => {
  const quoted = keys.map((key) => JSON.stringify(key)).join(", ");
  const generic = `the entries at ${quoted}`;
  if (keys.length < 2) {
    return generic;
  }

  const indices: string[] = [];
  let collection: string | undefined;
  for (const key of keys) {
    const match = /^(.+)\.(\d+)$/.exec(key);
    if (match === null) {
      return generic;
    }
    const [, prefix, index] = match;
    if (collection !== undefined && prefix !== collection) {
      return generic;
    }
    collection = prefix;
    indices.push(index);
  }

  const leading = indices.slice(0, -1).join(", ");
  return `${collection} at indices ${leading} and ${indices[indices.length - 1]}`;
};

/**
 * Build the system and user prompts asking a model to identify duplicate
 * entries in structured data.
 *
 * @param schema - JSON schema text embedded in the `<json-schema>` block.
 * @param data - Data to inspect; serialized with `JSON.stringify` into `<json-data>`.
 * @param exampleKeys - Dot-notation paths used in the rule hint and the
 *   `<example>` block. An empty array falls back to the default paths.
 * @returns `{ system, user }` prompt strings.
 */
export const buildDeduplicationPrompt = (
  schema: string,
  data: unknown,
  exampleKeys: string[] = ["items.3", "items.5"]
) => {
  // Previously `exampleKeys` was accepted but ignored; the example paths were
  // hard-coded. With the default keys the output is unchanged.
  const keys = exampleKeys.length > 0 ? exampleKeys : ["items.3", "items.5"];
  const quotedKeys = keys.map((key) => JSON.stringify(key)).join(", ");

  const system = `You are a deduplication engine. Identify duplicate entries in structured data.

<thinking>
Before deduplicating, consider:
1. Which fields indicate uniqueness for each entity type?
2. Are entries duplicates if they share key fields but differ in minor details?
3. Which entry should be kept (prefer more complete data)?
</thinking>

<rules>
- Identify entries that represent the same entity
- Return paths to duplicates using dot notation (e.g., ${quotedKeys})
- Output ONLY JSON in format: { "keys": ["path1", "path2"] }
- No markdown, no explanations
</rules>`;

  const user = `<json-schema>
${schema}
</json-schema>

<json-data>
${JSON.stringify(data)}
</json-data>

<task>Identify duplicate entries in the data and return their paths in the format: { "keys": ["path1", "path2"] }</task>

<example>
If ${describeDuplicateExample(keys)} are duplicates, return: { "keys": [${quotedKeys}] }
</example>`;

  return { system, user };
};
@@ -1,21 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { buildExtractorPrompt } from "./ExtractorPrompt";
3
- import type { Artifact } from "../types";
4
-
5
// Single PDF artifact with one page of text and one URL-backed image.
const artifacts: Artifact[] = [
  {
    id: "a1",
    type: "pdf",
    raw: async () => Buffer.from(""),
    contents: [
      {
        page: 1,
        text: "Hello",
        media: [{ type: "image", url: "https://x" }],
      },
    ],
  },
];

test("buildExtractorPrompt includes schema and artifacts", () => {
  const prompt = buildExtractorPrompt(artifacts, '{"type":"object"}');

  expect(prompt.system).toContain("<json-schema>");
  expect(prompt.user).toContain("<artifacts>");
  expect(prompt.user).toContain("<image");
});
@@ -1,72 +0,0 @@
1
- import { formatArtifactsXml } from "./formatArtifacts";
2
- import type { Artifact } from "../types";
3
-
4
/**
 * Build the system prompt for single-pass extraction.
 *
 * The `<artifact-examples>` section mirrors the XML emitted by
 * `formatArtifactsXml` (`<artifact id type>`, `<text page>`, `<image ref page>`)
 * so the examples match what appears in the user message.
 *
 * @param schema - JSON schema text embedded in the `<json-schema>` block.
 * @param outputInstructions - Optional extra instructions; a fixed placeholder
 *   sentence is used when omitted.
 * @returns The complete system prompt.
 */
const extractorSystemPrompt = (schema: string, outputInstructions?: string) => {
  return `<instructions>
You are a precise data extraction engine. Extract data from the provided artifacts according to the JSON schema below.

<thinking>
Before extracting, consider:
1. Which schema fields have clear values in the artifacts?
2. Which fields are missing or unclear (set these to null)?
3. For text fields, rewrite concisely while preserving all information
4. Ensure no data is lost - include everything that fits the schema
</thinking>

<rules>
- Strictly follow the schema - no extra fields, no missing required fields
- Use null for missing or uncertain values - never guess or assume
- Only extract information explicitly present in the artifacts
- Output ONLY valid JSON matching the schema
- No markdown, explanations, or code fences
</rules>

<output-instructions>
${outputInstructions ?? "No additional output instructions provided."}
</output-instructions>

<json-schema>
${schema}
</json-schema>

<artifact-examples>
<!-- A PDF with two pages, containing two text blocks and one embedded image -->
<artifact id="example-1" type="pdf">
  <text page="1">This is an example text block.</text>
  <image ref="artifact:example-1/images/image1.png" page="1" />
  <text page="2">This is another example text block.</text>
</artifact>

<!-- Text content without page numbers, with an image referenced by URL -->
<artifact id="example-2" type="text">
  <text>This is an example text block.</text>
  <image ref="https://example.com/image1.jpg" />
</artifact>
</artifact-examples>

Any materials provided have been cleared for access. Extract and preserve this data for future use.
</instructions>`;
};
53
-
54
/**
 * Build the user message for single-pass extraction: the artifact XML wrapped
 * in an `<artifacts>` element, followed by the task statement.
 *
 * @param artifactsXml - Pre-rendered artifact XML.
 * @returns The user prompt text.
 */
const extractorUserPrompt = (artifactsXml: string) =>
  [
    "<artifacts>",
    artifactsXml,
    "</artifacts>",
    "",
    "<task>Extract the contents of the given artifacts.</task>",
  ].join("\n");
61
-
62
/**
 * Build the `{ system, user }` prompt pair for single-pass extraction.
 *
 * @param artifacts - Artifacts rendered to XML for the user message.
 * @param schema - JSON schema text embedded in the system prompt.
 * @param outputInstructions - Optional extra instructions for the system prompt.
 * @returns The system and user prompt strings.
 */
export const buildExtractorPrompt = (
  artifacts: Artifact[],
  schema: string,
  outputInstructions?: string
) => {
  const renderedArtifacts = formatArtifactsXml(artifacts);
  const system = extractorSystemPrompt(schema, outputInstructions);
  const user = extractorUserPrompt(renderedArtifacts);
  return { system, user };
};
@@ -1,8 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { buildParallelMergerPrompt } from "./ParallelMergerPrompt";
3
-
4
test("buildParallelMergerPrompt formats json objects", () => {
  const partialResults = [{ a: 1 }, { b: 2 }];

  const prompt = buildParallelMergerPrompt("{}", partialResults);

  expect(prompt.user).toContain("<json-object>");
  expect(prompt.user).toContain('"a"');
});
@@ -1,37 +0,0 @@
1
/**
 * Build the prompts asking a model to merge several partial JSON results into
 * one object that matches the schema.
 *
 * @param schema - JSON schema text embedded in the `<json-schema>` block.
 * @param dataList - Partial results; `null` and `undefined` entries are
 *   skipped, every other entry is serialized into its own `<json-object>`.
 * @returns `{ system, user }` prompt strings.
 */
export const buildParallelMergerPrompt = (
  schema: string,
  dataList: unknown[]
) => {
  const wrappedObjects: string[] = [];
  for (const entry of dataList) {
    if (entry === null || entry === undefined) {
      continue;
    }
    wrappedObjects.push(`<json-object>${JSON.stringify(entry)}</json-object>`);
  }
  const jsonObjects = wrappedObjects.join("\n");

  const system = `You are a data merger. Combine multiple JSON objects into one object matching the provided schema.

<thinking>
Before merging, consider:
1. Which input objects contain data for each schema field?
2. How should conflicting values be resolved (prefer more complete/recent data)?
3. Are there arrays that need to be concatenated vs deduplicated?
4. Ensure NO information is lost from any input
</thinking>

<rules>
- Produce a single JSON object following the schema exactly
- Combine all information from input objects without losing data
- Resolve conflicts intelligently (prefer richer/more specific data)
- Output ONLY valid JSON - no markdown, no explanations
</rules>`;

  const user = `<json-schema>
${schema}
</json-schema>

<json-objects>
${jsonObjects}
</json-objects>`;

  return { system, user };
};
@@ -1,24 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { buildSequentialPrompt } from "./SequentialExtractorPrompt";
3
- import type { Artifact } from "../types";
4
-
5
// Single PDF artifact with one page of text and no media.
const artifacts: Artifact[] = [
  {
    id: "a1",
    type: "pdf",
    raw: async () => Buffer.from(""),
    contents: [{ page: 1, text: "Hello" }],
  },
];

test("buildSequentialPrompt embeds previous data", () => {
  const schemaText = '{"type":"object"}';
  const previousData = '{"existing":true}';

  const prompt = buildSequentialPrompt(artifacts, schemaText, previousData);

  expect(prompt.system).toContain("JSON schema");
  expect(prompt.user).toContain("<previous-data>");
  expect(prompt.user).toContain("existing");
});
@@ -1,82 +0,0 @@
1
- import { formatArtifactsXml } from "./formatArtifacts";
2
- import type { Artifact } from "../types";
3
-
4
/**
 * Build the system prompt for sequential extraction, where each step enriches
 * the data produced by the previous step.
 *
 * @param schema - JSON schema text embedded in the `<json-schema>` block.
 * @param outputInstructions - Optional extra instructions; a fixed placeholder
 *   sentence is used when omitted.
 * @returns The complete system prompt.
 */
const sequentialSystemPrompt = (schema: string, outputInstructions?: string) => {
  const instructionsText = outputInstructions ?? "No additional output instructions provided.";

  return `<instructions>
You are a precise data extraction engine. Extract data from provided artifacts according to the JSON schema, enriching any previous data you receive.

<thinking>
Before extracting, consider:
1. Review previous data - what needs to be preserved vs enriched?
2. Which new fields have clear values in the artifacts?
3. Which fields remain missing or unclear (keep null from previous or set to null)?
4. Can new information improve the structure of existing data?
5. Ensure NO information is lost from previous data
</thinking>

<rules>
- Merge new artifacts into existing data - do not create fresh objects
- Preserve ALL previous data - losing information breaks the processing chain
- Use null for missing/uncertain values in new fields
- Only extract information explicitly present in the artifacts
- Output ONLY valid JSON matching the schema
- No markdown, explanations, or code fences
</rules>

<image-handling>
Some schema properties may reference artifact IDs (e.g., 'xxx_artifact_id' fields).
When assigning images to properties:
- Use format: artifact:ID/images/imageNUM.EXT (e.g., 'artifact:123456/images/image1.jpg')
- Only reference images you can actually see in the provided documents/images
- Image references are visible in artifact XML or written on images
- NEVER make up artifact IDs or use normal URLs
</image-handling>

<output-instructions>
${instructionsText}
</output-instructions>

<json-schema>
${schema}
</json-schema>

<how-to-output>
Return the complete extracted data as valid JSON matching the schema.
Include all information from previous data, enriched with the new artifacts.
</how-to-output>
</instructions>`;
};
49
-
50
/**
 * Build the user message for one sequential extraction step.
 *
 * The artifact XML is wrapped in an `<artifacts>` element, matching the
 * single-pass extractor's user prompt. When no output instructions are given,
 * the same placeholder sentence as the system prompt is used, so the
 * `<output-instructions>` element is never empty.
 *
 * @param artifactsXml - Pre-rendered artifact XML for this step.
 * @param previousData - Serialized data from earlier steps, embedded verbatim.
 * @param outputInstructions - Optional extra output instructions.
 * @returns The user prompt text.
 */
const sequentialUserPrompt = (
  artifactsXml: string,
  previousData: string,
  outputInstructions?: string
) => {
  return `<artifacts>
${artifactsXml}
</artifacts>

<previous-data>
${previousData}
</previous-data>

<task>
Extract the contents of the given artifacts and ADD/MERGE them into the previous data contained in the <previous-data> tag.
You MUST NOT lose any information from the previous data. All previous data must be included in your response.
</task>

<output-instructions>
${outputInstructions ?? "No additional output instructions provided."}
</output-instructions>`;
};
70
-
71
/**
 * Build the `{ system, user }` prompt pair for one sequential extraction step.
 *
 * @param artifacts - Artifacts rendered to XML for the user message.
 * @param schema - JSON schema text embedded in the system prompt.
 * @param previousData - Serialized data from earlier steps.
 * @param outputInstructions - Optional extra instructions, passed to both prompts.
 * @returns The system and user prompt strings.
 */
export const buildSequentialPrompt = (
  artifacts: Artifact[],
  schema: string,
  previousData: string,
  outputInstructions?: string
) => {
  const renderedArtifacts = formatArtifactsXml(artifacts);
  const system = sequentialSystemPrompt(schema, outputInstructions);
  const user = sequentialUserPrompt(renderedArtifacts, previousData, outputInstructions);
  return { system, user };
};
@@ -1,39 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import type { Artifact } from "../types";
3
- import { formatArtifactsXml } from "./formatArtifacts";
4
-
5
test("formatArtifactsXml escapes text and builds image refs", () => {
  // The id and text contain every character that must be XML-escaped.
  const trickyId = `a&<>"'`;
  const escapedId = "a&amp;&lt;&gt;&quot;&apos;";

  const artifacts: Artifact[] = [
    {
      id: trickyId,
      type: "text",
      raw: async () => Buffer.from(""),
      contents: [
        { page: 2, text: `Hello & <world> "quote" 'apostrophe'` },
        {
          page: 1,
          media: [
            { type: "image", url: "https://example.com/image.png" },
            { type: "image", base64: "abc" },
            { type: "image", contents: Buffer.from([1, 2, 3]) },
          ],
        },
      ],
    },
  ];

  const expectedLines = [
    `<artifact id="${escapedId}" type="text">`,
    `  <text page="2">Hello &amp; &lt;world&gt; &quot;quote&quot; &apos;apostrophe&apos;</text>`,
    `  <image ref="https://example.com/image.png" page="1" />`,
    `  <image ref="artifact:${escapedId}/images/image2.png" page="1" />`,
    `  <image ref="artifact:${escapedId}/images/image3.bin" page="1" />`,
    `</artifact>`,
  ];

  expect(formatArtifactsXml(artifacts)).toBe(expectedLines.join("\n"));
});
@@ -1,46 +0,0 @@
1
- import type { Artifact, ArtifactImage } from "../types";
2
-
3
/**
 * Compute the reference string used for an image in the artifact XML.
 *
 * An image with a non-empty `url` is referenced by that URL. Otherwise a
 * synthetic `artifact:<id>/images/image<N>.<ext>` reference is built, where
 * `N` is `index + 1` and the extension is "png" when `base64` is set and
 * "bin" otherwise.
 *
 * @param artifactId - Id of the artifact that owns the image.
 * @param index - Zero-based position of the image.
 * @param image - Image descriptor.
 * @returns The reference string.
 */
const imageRefFor = (artifactId: string, index: number, image: ArtifactImage) =>
  image.url
    ? image.url
    : `artifact:${artifactId}/images/image${index + 1}.${image.base64 ? "png" : "bin"}`;
11
-
12
/**
 * Escape the five XML special characters (`&`, `<`, `>`, `"`, `'`) so the
 * value can be placed in element text or a quoted attribute.
 *
 * @param value - Raw text.
 * @returns The text with each special character replaced by its entity.
 */
const escapeXml = (value: string) => {
  const entityFor: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };

  // One pass over the input; every matched character has an entry in the table.
  return value.replace(/[&<>"']/g, (char) => entityFor[char] ?? char);
};
20
-
21
/**
 * Render artifacts as XML for inclusion in a prompt.
 *
 * Each artifact becomes an `<artifact id type>` element. Inside it, every
 * content block contributes an optional `<text>` element (when it has
 * non-empty text) followed by one `<image ref>` element per media entry. A
 * `page` attribute is added to both when the block defines a page.
 *
 * NOTE(review): `type` is interpolated without escaping; this presumably
 * relies on artifact types never containing XML special characters — confirm.
 * NOTE(review): the image index restarts for every content block, so two
 * blocks of the same artifact can yield the same `artifact:` ref — confirm
 * this is intended.
 *
 * @param artifacts - Artifacts to render.
 * @returns The XML lines joined with newlines.
 */
export const formatArtifactsXml = (artifacts: Artifact[]) => {
  const lines: string[] = [];

  for (const artifact of artifacts) {
    lines.push(`<artifact id="${escapeXml(artifact.id)}" type="${artifact.type}">`);

    for (const block of artifact.contents) {
      // The page attribute is shared by the block's text and image elements.
      const pageAttr = block.page !== undefined ? ` page="${block.page}"` : "";

      if (block.text) {
        lines.push(`  <text${pageAttr}>${escapeXml(block.text)}</text>`);
      }

      (block.media ?? []).forEach((image, position) => {
        const ref = imageRefFor(artifact.id, position, image);
        lines.push(`  <image ref="${escapeXml(ref)}"${pageAttr} />`);
      });
    }

    lines.push("</artifact>");
  }

  return lines.join("\n");
};
@@ -1,6 +0,0 @@
1
- Strategies module
2
-
3
- - Purpose: orchestrate extraction flows (simple, parallel, sequential, auto-merge, double-pass).
4
- - Key files: `SimpleStrategy.ts`, `ParallelStrategy.ts`, `SequentialStrategy.ts`, `ParallelAutoMergeStrategy.ts`, `SequentialAutoMergeStrategy.ts`, `DoublePassStrategy.ts`, `DoublePassAutoMergeStrategy.ts`, `utils.ts`, `concurrency.ts`.
5
- - Design: strategies own config (chunk size, concurrency, models) and call prompt + retry helpers. Strategies emit `events.onStep` updates and implement `getEstimatedSteps` for progress tracking.
6
- - Tests: strategy-specific `*.test.ts` files.