@struktur/sdk 2.1.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/artifacts/fileToArtifact.d.ts +8 -0
- package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
- package/dist/artifacts/input.d.ts +60 -0
- package/dist/artifacts/input.d.ts.map +1 -0
- package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
- package/dist/artifacts/providers.d.ts.map +1 -0
- package/dist/artifacts/urlToArtifact.d.ts +3 -0
- package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
- package/dist/auth/config.d.ts +34 -0
- package/dist/auth/config.d.ts.map +1 -0
- package/dist/auth/tokens.d.ts +18 -0
- package/dist/auth/tokens.d.ts.map +1 -0
- package/dist/chunking/ArtifactBatcher.d.ts +11 -0
- package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
- package/dist/chunking/ArtifactSplitter.d.ts +10 -0
- package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
- package/dist/debug/logger.d.ts +169 -0
- package/dist/debug/logger.d.ts.map +1 -0
- package/dist/extract.d.ts +3 -0
- package/dist/extract.d.ts.map +1 -0
- package/dist/fields.d.ts +75 -0
- package/dist/fields.d.ts.map +1 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5603 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/LLMClient.d.ts +40 -0
- package/dist/llm/LLMClient.d.ts.map +1 -0
- package/dist/llm/RetryingRunner.d.ts +37 -0
- package/dist/llm/RetryingRunner.d.ts.map +1 -0
- package/dist/llm/message.d.ts +12 -0
- package/dist/llm/message.d.ts.map +1 -0
- package/dist/llm/models.d.ts +13 -0
- package/dist/llm/models.d.ts.map +1 -0
- package/dist/llm/resolveModel.d.ts +3 -0
- package/dist/llm/resolveModel.d.ts.map +1 -0
- package/dist/merge/Deduplicator.d.ts +4 -0
- package/dist/merge/Deduplicator.d.ts.map +1 -0
- package/dist/merge/SmartDataMerger.d.ts +7 -0
- package/dist/merge/SmartDataMerger.d.ts.map +1 -0
- package/dist/parsers/collect.d.ts +7 -0
- package/dist/parsers/collect.d.ts.map +1 -0
- package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/mime.d.ts +12 -0
- package/dist/parsers/mime.d.ts.map +1 -0
- package/dist/parsers/npm.d.ts +16 -0
- package/dist/parsers/npm.d.ts.map +1 -0
- package/dist/parsers/pdf.d.ts +36 -0
- package/dist/parsers/pdf.d.ts.map +1 -0
- package/dist/parsers/runner.d.ts +4 -0
- package/dist/parsers/runner.d.ts.map +1 -0
- package/dist/parsers/types.d.ts +27 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers.d.ts +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
- package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
- package/dist/prompts/ExtractorPrompt.d.ts +6 -0
- package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/formatArtifacts.d.ts +3 -0
- package/dist/prompts/formatArtifacts.d.ts.map +1 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/DoublePassStrategy.d.ts +22 -0
- package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelStrategy.d.ts +22 -0
- package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialStrategy.d.ts +20 -0
- package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
- package/dist/strategies/SimpleStrategy.d.ts +18 -0
- package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
- package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentTools.d.ts +55 -0
- package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
- package/dist/strategies/agent/index.d.ts +4 -0
- package/dist/strategies/agent/index.d.ts.map +1 -0
- package/dist/strategies/concurrency.d.ts +2 -0
- package/dist/strategies/concurrency.d.ts.map +1 -0
- package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
- package/dist/strategies/index.d.ts.map +1 -0
- package/dist/strategies/utils.d.ts +39 -0
- package/dist/strategies/utils.d.ts.map +1 -0
- package/dist/strategies.d.ts +1 -0
- package/dist/strategies.js +3930 -0
- package/dist/strategies.js.map +1 -0
- package/dist/tokenization.d.ts +11 -0
- package/dist/tokenization.d.ts.map +1 -0
- package/dist/types.d.ts +178 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/validation/validator.d.ts +20 -0
- package/dist/validation/validator.d.ts.map +1 -0
- package/package.json +30 -14
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
package/src/parsers/runner.ts
DELETED
|
@@ -1,177 +0,0 @@
|
|
|
1
|
-
import os from "node:os";
|
|
2
|
-
import path from "node:path";
|
|
3
|
-
import { rm, writeFile } from "node:fs/promises";
|
|
4
|
-
import type { Artifact } from "../types";
|
|
5
|
-
import type { ParserDef, ParserInput } from "./types";
|
|
6
|
-
import type { NpmParserModule } from "./npm";
|
|
7
|
-
import {
|
|
8
|
-
hydrateSerializedArtifacts,
|
|
9
|
-
validateSerializedArtifacts,
|
|
10
|
-
} from "../artifacts/input";
|
|
11
|
-
|
|
12
|
-
const parseCommandOutput = (stdout: string): Artifact[] => {
|
|
13
|
-
let parsed: unknown;
|
|
14
|
-
try {
|
|
15
|
-
parsed = JSON.parse(stdout);
|
|
16
|
-
} catch (error) {
|
|
17
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
18
|
-
throw new Error(`Parser command produced invalid JSON: ${message}\nOutput: ${stdout.slice(0, 200)}`);
|
|
19
|
-
}
|
|
20
|
-
const serialized = validateSerializedArtifacts(parsed);
|
|
21
|
-
return hydrateSerializedArtifacts(serialized);
|
|
22
|
-
};
|
|
23
|
-
|
|
24
|
-
const spawnAndCapture = async (command: string, stdinBuffer?: Buffer): Promise<string> => {
|
|
25
|
-
if (!command.trim()) {
|
|
26
|
-
throw new Error(`Empty command: ${command}`);
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
const proc = Bun.spawn(["sh", "-c", command], {
|
|
30
|
-
stdout: "pipe",
|
|
31
|
-
stderr: "pipe",
|
|
32
|
-
stdin: stdinBuffer ? "pipe" : "ignore",
|
|
33
|
-
});
|
|
34
|
-
|
|
35
|
-
if (stdinBuffer && proc.stdin) {
|
|
36
|
-
// Bun's FileSink uses write/end, not the WritableStream API
|
|
37
|
-
const sink = proc.stdin as { write: (data: Uint8Array) => void; end: () => void };
|
|
38
|
-
sink.write(stdinBuffer);
|
|
39
|
-
sink.end();
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
const [stdout, stderr, exitCode] = await Promise.all([
|
|
43
|
-
new Response(proc.stdout).text(),
|
|
44
|
-
new Response(proc.stderr).text(),
|
|
45
|
-
proc.exited,
|
|
46
|
-
]);
|
|
47
|
-
|
|
48
|
-
if (exitCode !== 0) {
|
|
49
|
-
throw new Error(
|
|
50
|
-
`Parser command exited with code ${exitCode}: ${command}\nStderr: ${stderr.slice(0, 500)}`
|
|
51
|
-
);
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
return stdout;
|
|
55
|
-
};
|
|
56
|
-
|
|
57
|
-
const runNpmParser = async (
|
|
58
|
-
pkg: string,
|
|
59
|
-
input: ParserInput,
|
|
60
|
-
mimeType: string,
|
|
61
|
-
): Promise<Artifact[]> => {
|
|
62
|
-
const mod = (await import(pkg)) as NpmParserModule;
|
|
63
|
-
|
|
64
|
-
const hasParseFile = typeof mod.parseFile === "function";
|
|
65
|
-
const hasParseStream = typeof mod.parseStream === "function";
|
|
66
|
-
|
|
67
|
-
if (!hasParseFile && !hasParseStream) {
|
|
68
|
-
throw new Error(
|
|
69
|
-
`npm parser package "${pkg}" exports neither parseFile nor parseStream`
|
|
70
|
-
);
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
if (input.kind === "file") {
|
|
74
|
-
// Prefer parseFile for zero-copy
|
|
75
|
-
if (hasParseFile) {
|
|
76
|
-
return mod.parseFile!(input.path, mimeType);
|
|
77
|
-
}
|
|
78
|
-
// Fallback: open file as stream
|
|
79
|
-
const file = Bun.file(input.path);
|
|
80
|
-
const stream = file.stream() as ReadableStream<Uint8Array>;
|
|
81
|
-
return mod.parseStream!(stream, mimeType);
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
// input.kind === "buffer"
|
|
85
|
-
if (hasParseStream) {
|
|
86
|
-
// Prefer parseStream for buffers
|
|
87
|
-
const stream = new ReadableStream<Uint8Array>({
|
|
88
|
-
start(controller) {
|
|
89
|
-
controller.enqueue(input.buffer);
|
|
90
|
-
controller.close();
|
|
91
|
-
},
|
|
92
|
-
});
|
|
93
|
-
return mod.parseStream!(stream, mimeType);
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
// Fallback: write buffer to temp file, call parseFile, clean up
|
|
97
|
-
const tmpFile = path.join(os.tmpdir(), `struktur-parse-${crypto.randomUUID()}`);
|
|
98
|
-
try {
|
|
99
|
-
await writeFile(tmpFile, input.buffer);
|
|
100
|
-
return await mod.parseFile!(tmpFile, mimeType);
|
|
101
|
-
} finally {
|
|
102
|
-
await rm(tmpFile, { force: true });
|
|
103
|
-
}
|
|
104
|
-
};
|
|
105
|
-
|
|
106
|
-
/**
 * Substitutes every `FILE_PATH` placeholder in a command template with `filePath`.
 *
 * A replacer function is used instead of a replacement string so that
 * `$`-sequences in the path (`$&`, `$$`, `$1`, ...) are inserted literally
 * rather than being interpreted as `String.prototype.replace` patterns.
 */
const interpolateFilePath = (command: string, filePath: string): string =>
  command.replace(/FILE_PATH/g, () => filePath);

/**
 * Runs a "command-file" parser: the command template receives the input as a
 * path on disk (via the `FILE_PATH` placeholder) and its stdout is parsed into
 * artifacts.
 *
 * @param command - Shell command template containing `FILE_PATH` placeholders.
 * @param input - Either a path to an existing file or an in-memory buffer.
 *   Buffers are written to a temporary file that is removed afterwards.
 * @returns The artifacts decoded from the command's stdout.
 * @throws If the temp file cannot be written, or whatever `spawnAndCapture` /
 *   `parseCommandOutput` throw for a failing command or invalid output.
 */
const runCommandFileParser = async (
  command: string,
  input: ParserInput,
): Promise<Artifact[]> => {
  // Only set when we created the file ourselves and therefore own its cleanup.
  let tempFile: string | null = null;

  try {
    let filePath: string;
    if (input.kind === "file") {
      filePath = input.path;
    } else {
      // Record the temp path *before* writing so that a partially written
      // file is still removed by the `finally` block if the write fails.
      tempFile = path.join(os.tmpdir(), `struktur-parse-${crypto.randomUUID()}`);
      await writeFile(tempFile, input.buffer);
      filePath = tempFile;
    }

    // NOTE(review): the path is inserted unquoted and the resulting string is
    // passed to spawnAndCapture, so paths with spaces or shell metacharacters
    // rely on the template quoting FILE_PATH itself — confirm with callers
    // before adding automatic quoting.
    const interpolated = interpolateFilePath(command, filePath);
    const stdout = await spawnAndCapture(interpolated);
    return parseCommandOutput(stdout);
  } finally {
    if (tempFile) {
      // `force: true` makes this a no-op when the file was never created.
      await rm(tempFile, { force: true });
    }
  }
};
|
|
132
|
-
|
|
133
|
-
const runCommandStdinParser = async (
|
|
134
|
-
command: string,
|
|
135
|
-
input: ParserInput,
|
|
136
|
-
): Promise<Artifact[]> => {
|
|
137
|
-
let buffer: Buffer;
|
|
138
|
-
|
|
139
|
-
if (input.kind === "file") {
|
|
140
|
-
const file = Bun.file(input.path);
|
|
141
|
-
buffer = Buffer.from(await file.arrayBuffer());
|
|
142
|
-
} else {
|
|
143
|
-
buffer = input.buffer;
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
const stdout = await spawnAndCapture(command, buffer);
|
|
147
|
-
return parseCommandOutput(stdout);
|
|
148
|
-
};
|
|
149
|
-
|
|
150
|
-
export const runParser = async (
|
|
151
|
-
def: ParserDef,
|
|
152
|
-
input: ParserInput,
|
|
153
|
-
mimeType: string,
|
|
154
|
-
): Promise<Artifact[]> => {
|
|
155
|
-
switch (def.type) {
|
|
156
|
-
case "npm":
|
|
157
|
-
return runNpmParser(def.package, input, mimeType);
|
|
158
|
-
case "command-file":
|
|
159
|
-
return runCommandFileParser(def.command, input);
|
|
160
|
-
case "command-stdin":
|
|
161
|
-
return runCommandStdinParser(def.command, input);
|
|
162
|
-
case "inline": {
|
|
163
|
-
let buffer: Buffer;
|
|
164
|
-
if (input.kind === "file") {
|
|
165
|
-
const file = Bun.file(input.path);
|
|
166
|
-
buffer = Buffer.from(await file.arrayBuffer());
|
|
167
|
-
} else {
|
|
168
|
-
buffer = input.buffer;
|
|
169
|
-
}
|
|
170
|
-
return [await def.handler(buffer)];
|
|
171
|
-
}
|
|
172
|
-
default: {
|
|
173
|
-
const _exhaustive: never = def;
|
|
174
|
-
throw new Error(`Unknown parser type: ${(_exhaustive as { type: string }).type}`);
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
|
-
};
|
package/src/parsers/types.ts
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import type { Artifact } from "../types";
|
|
2
|
-
|
|
3
|
-
export type NpmParserDef = {
|
|
4
|
-
type: "npm";
|
|
5
|
-
package: string; // e.g. "@myorg/pdf-parser"
|
|
6
|
-
};
|
|
7
|
-
|
|
8
|
-
export type CommandFileDef = {
|
|
9
|
-
type: "command-file";
|
|
10
|
-
command: string; // must contain FILE_PATH placeholder
|
|
11
|
-
};
|
|
12
|
-
|
|
13
|
-
export type CommandStdinDef = {
|
|
14
|
-
type: "command-stdin";
|
|
15
|
-
command: string;
|
|
16
|
-
};
|
|
17
|
-
|
|
18
|
-
export type InlineParserDef = {
|
|
19
|
-
type: "inline";
|
|
20
|
-
handler: (buffer: Buffer) => Promise<Artifact>;
|
|
21
|
-
};
|
|
22
|
-
|
|
23
|
-
export type ParserDef = NpmParserDef | CommandFileDef | CommandStdinDef | InlineParserDef;
|
|
24
|
-
|
|
25
|
-
export type ParsersConfig = Record<string, ParserDef>; // keyed by MIME type
|
|
26
|
-
|
|
27
|
-
export type ParserInput =
|
|
28
|
-
| { kind: "file"; path: string }
|
|
29
|
-
| { kind: "buffer"; buffer: Buffer };
|
package/src/prompts/AGENTS.md
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
Prompts module
|
|
2
|
-
|
|
3
|
-
- Purpose: generate optimized extraction/merge/dedupe prompt texts with thinking guidance and format artifacts as XML.
|
|
4
|
-
- Design: prompt builders return `{ system, user }` strings with concise instructions and `<thinking>` sections; artifacts render into XML blocks with image refs.
|
|
5
|
-
- Changes: System prompts now include structured `<thinking>` guidance and `<rules>` sections, and place `outputInstructions` above the schema for better context flow.
|
|
6
|
-
- Key files: `ExtractorPrompt.ts`, `SequentialExtractorPrompt.ts`, `ParallelMergerPrompt.ts`, `DeduplicationPrompt.ts`, `formatArtifacts.ts`.
|
|
7
|
-
- Design: prompt builders return `{ system, user }` strings; artifacts render into XML blocks with image refs.
|
|
8
|
-
- Tests: `ExtractorPrompt.test.ts`, `SequentialExtractorPrompt.test.ts`, `ParallelMergerPrompt.test.ts`, `DeduplicationPrompt.test.ts`, `formatArtifacts.test.ts`.
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { buildDeduplicationPrompt } from "./DeduplicationPrompt";
|
|
3
|
-
|
|
4
|
-
test("buildDeduplicationPrompt includes keys format instructions", () => {
|
|
5
|
-
const { system, user } = buildDeduplicationPrompt("{}", { items: [] });
|
|
6
|
-
expect(system).toContain("keys");
|
|
7
|
-
expect(user).toContain("duplicate");
|
|
8
|
-
});
|
|
9
|
-
|
|
10
|
-
test("buildDeduplicationPrompt embeds schema", () => {
|
|
11
|
-
const schema = '{"type":"object","properties":{"items":{"type":"array"}}}';
|
|
12
|
-
const { user } = buildDeduplicationPrompt(schema, { items: [] });
|
|
13
|
-
expect(user).toContain(schema);
|
|
14
|
-
});
|
|
15
|
-
|
|
16
|
-
test("buildDeduplicationPrompt embeds data", () => {
|
|
17
|
-
const data = { items: [{ id: 1 }, { id: 1 }] };
|
|
18
|
-
const { user } = buildDeduplicationPrompt("{}", data);
|
|
19
|
-
expect(user).toContain(JSON.stringify(data));
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
test("buildDeduplicationPrompt uses default example keys in example", () => {
|
|
23
|
-
const { user } = buildDeduplicationPrompt("{}", { items: [] });
|
|
24
|
-
expect(user).toContain("items.3");
|
|
25
|
-
expect(user).toContain("items.5");
|
|
26
|
-
});
|
|
27
|
-
|
|
28
|
-
test("buildDeduplicationPrompt includes thinking section", () => {
|
|
29
|
-
const { system } = buildDeduplicationPrompt("{}", { items: [] });
|
|
30
|
-
expect(system).toContain("<thinking>");
|
|
31
|
-
});
|
|
32
|
-
|
|
33
|
-
test("buildDeduplicationPrompt includes rules section", () => {
|
|
34
|
-
const { system } = buildDeduplicationPrompt("{}", { items: [] });
|
|
35
|
-
expect(system).toContain("<rules>");
|
|
36
|
-
});
|
|
37
|
-
|
|
38
|
-
test("buildDeduplicationPrompt includes task", () => {
|
|
39
|
-
const { user } = buildDeduplicationPrompt("{}", { items: [] });
|
|
40
|
-
expect(user).toContain("<task>");
|
|
41
|
-
});
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
/**
 * Builds the system/user prompt pair asking the model to list duplicate
 * entries in `data` as dot-notation paths.
 *
 * @param schema - JSON schema text, embedded verbatim in `<json-schema>`.
 * @param data - The structured data to inspect; serialized with `JSON.stringify`.
 * @param exampleKeys - Paths shown to the model as the example of the expected
 *   output. Previously this argument was accepted but ignored; it now drives
 *   both the rules hint and the `<example>` block. An empty array falls back
 *   to the default example paths.
 * @returns `{ system, user }` prompt strings.
 */
export const buildDeduplicationPrompt = (
  schema: string,
  data: unknown,
  exampleKeys: string[] = ["items.3", "items.5"]
) => {
  // An empty example would render as `{ "keys": [] }`, which reads as
  // "there are no duplicates" — keep a meaningful illustration instead.
  const keys = exampleKeys.length > 0 ? exampleKeys : ["items.3", "items.5"];
  // JSON.stringify per key gives correctly quoted/escaped JSON string literals.
  const quotedKeys = keys.map((key) => JSON.stringify(key)).join(", ");

  const system = `You are a deduplication engine. Identify duplicate entries in structured data.

<thinking>
Before deduplicating, consider:
1. Which fields indicate uniqueness for each entity type?
2. Are entries duplicates if they share key fields but differ in minor details?
3. Which entry should be kept (prefer more complete data)?
</thinking>

<rules>
- Identify entries that represent the same entity
- Return paths to duplicates using dot notation (e.g., ${quotedKeys})
- Output ONLY JSON in format: { "keys": ["path1", "path2"] }
- No markdown, no explanations
</rules>`;

  const user = `<json-schema>
${schema}
</json-schema>

<json-data>
${JSON.stringify(data)}
</json-data>

<task>Identify duplicate entries in the data and return their paths in the format: { "keys": ["path1", "path2"] }</task>

<example>
If the entries at ${quotedKeys} are duplicates, return: { "keys": [${quotedKeys}] }
</example>`;

  return { system, user };
};
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { buildExtractorPrompt } from "./ExtractorPrompt";
|
|
3
|
-
import type { Artifact } from "../types";
|
|
4
|
-
|
|
5
|
-
const artifacts: Artifact[] = [
|
|
6
|
-
{
|
|
7
|
-
id: "a1",
|
|
8
|
-
type: "pdf",
|
|
9
|
-
raw: async () => Buffer.from(""),
|
|
10
|
-
contents: [
|
|
11
|
-
{ page: 1, text: "Hello", media: [{ type: "image", url: "https://x" }] },
|
|
12
|
-
],
|
|
13
|
-
},
|
|
14
|
-
];
|
|
15
|
-
|
|
16
|
-
test("buildExtractorPrompt includes schema and artifacts", () => {
|
|
17
|
-
const { system, user } = buildExtractorPrompt(artifacts, "{\"type\":\"object\"}");
|
|
18
|
-
expect(system).toContain("<json-schema>");
|
|
19
|
-
expect(user).toContain("<artifacts>");
|
|
20
|
-
expect(user).toContain("<image");
|
|
21
|
-
});
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
import { formatArtifactsXml } from "./formatArtifacts";
|
|
2
|
-
import type { Artifact } from "../types";
|
|
3
|
-
|
|
4
|
-
const extractorSystemPrompt = (schema: string, outputInstructions?: string) => {
|
|
5
|
-
return `<instructions>
|
|
6
|
-
You are a precise data extraction engine. Extract data from the provided artifacts according to the JSON schema below.
|
|
7
|
-
|
|
8
|
-
<thinking>
|
|
9
|
-
Before extracting, consider:
|
|
10
|
-
1. Which schema fields have clear values in the artifacts?
|
|
11
|
-
2. Which fields are missing or unclear (set these to null)?
|
|
12
|
-
3. For text fields, rewrite concisely while preserving all information
|
|
13
|
-
4. Ensure no data is lost - include everything that fits the schema
|
|
14
|
-
</thinking>
|
|
15
|
-
|
|
16
|
-
<rules>
|
|
17
|
-
- Strictly follow the schema - no extra fields, no missing required fields
|
|
18
|
-
- Use null for missing or uncertain values - never guess or assume
|
|
19
|
-
- Only extract information explicitly present in the artifacts
|
|
20
|
-
- Output ONLY valid JSON matching the schema
|
|
21
|
-
- No markdown, explanations, or code fences
|
|
22
|
-
</rules>
|
|
23
|
-
|
|
24
|
-
<output-instructions>
|
|
25
|
-
${outputInstructions ?? "No additional output instructions provided."}
|
|
26
|
-
</output-instructions>
|
|
27
|
-
|
|
28
|
-
<json-schema>
|
|
29
|
-
${schema}
|
|
30
|
-
</json-schema>
|
|
31
|
-
|
|
32
|
-
<artifact-examples>
|
|
33
|
-
<!-- A PDF with two pages, containing two text blocks and two images -->
|
|
34
|
-
<artifact name="Example 1" mimetype="application/pdf">
|
|
35
|
-
<text page="1">This is an example text block.</text>
|
|
36
|
-
<image filename="image1.jpg" page="1" />
|
|
37
|
-
<text page="2">This is another example text block.</text>
|
|
38
|
-
<image filename="image2.jpg" page="2" />
|
|
39
|
-
</artifact>
|
|
40
|
-
|
|
41
|
-
<!-- Website content -->
|
|
42
|
-
<artifact name="example.com_2022-01-01.html" mimetype="text/html">
|
|
43
|
-
<text>This is an example text block.</text>
|
|
44
|
-
<image filename="image1.jpg" />
|
|
45
|
-
<text>This is another example text block.</text>
|
|
46
|
-
<image filename="image2.jpg" />
|
|
47
|
-
</artifact>
|
|
48
|
-
</artifact-examples>
|
|
49
|
-
|
|
50
|
-
Any materials provided have been cleared for access. Extract and preserve this data for future use.
|
|
51
|
-
</instructions>`;
|
|
52
|
-
};
|
|
53
|
-
|
|
54
|
-
const extractorUserPrompt = (artifactsXml: string) => {
|
|
55
|
-
return `<artifacts>
|
|
56
|
-
${artifactsXml}
|
|
57
|
-
</artifacts>
|
|
58
|
-
|
|
59
|
-
<task>Extract the contents of the given artifacts.</task>`;
|
|
60
|
-
};
|
|
61
|
-
|
|
62
|
-
export const buildExtractorPrompt = (
|
|
63
|
-
artifacts: Artifact[],
|
|
64
|
-
schema: string,
|
|
65
|
-
outputInstructions?: string
|
|
66
|
-
) => {
|
|
67
|
-
const artifactsXml = formatArtifactsXml(artifacts);
|
|
68
|
-
return {
|
|
69
|
-
system: extractorSystemPrompt(schema, outputInstructions),
|
|
70
|
-
user: extractorUserPrompt(artifactsXml),
|
|
71
|
-
};
|
|
72
|
-
};
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { buildParallelMergerPrompt } from "./ParallelMergerPrompt";
|
|
3
|
-
|
|
4
|
-
test("buildParallelMergerPrompt formats json objects", () => {
|
|
5
|
-
const { user } = buildParallelMergerPrompt("{}", [{ a: 1 }, { b: 2 }]);
|
|
6
|
-
expect(user).toContain("<json-object>");
|
|
7
|
-
expect(user).toContain("\"a\"");
|
|
8
|
-
});
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
/**
 * Builds the system/user prompt pair asking the model to merge several
 * partial extraction results into one object matching `schema`.
 *
 * @param schema - JSON schema text, embedded verbatim in `<json-schema>`.
 * @param dataList - Partial results to merge. `null`/`undefined` entries and
 *   values that have no JSON representation are skipped.
 * @returns `{ system, user }` prompt strings.
 */
export const buildParallelMergerPrompt = (
  schema: string,
  dataList: unknown[]
) => {
  const jsonObjects = dataList
    .filter((item) => item !== null && item !== undefined)
    // JSON.stringify yields `undefined` (not a string) for functions and
    // symbols; widen the type so those can be dropped instead of being
    // rendered as the literal text "undefined".
    .map((item): string | undefined => JSON.stringify(item))
    .filter((json): json is string => typeof json === "string")
    .map((json) => `<json-object>${json}</json-object>`)
    .join("\n");

  const system = `You are a data merger. Combine multiple JSON objects into one object matching the provided schema.

<thinking>
Before merging, consider:
1. Which input objects contain data for each schema field?
2. How should conflicting values be resolved (prefer more complete/recent data)?
3. Are there arrays that need to be concatenated vs deduplicated?
4. Ensure NO information is lost from any input
</thinking>

<rules>
- Produce a single JSON object following the schema exactly
- Combine all information from input objects without losing data
- Resolve conflicts intelligently (prefer richer/more specific data)
- Output ONLY valid JSON - no markdown, no explanations
</rules>`;

  const user = `<json-schema>
${schema}
</json-schema>

<json-objects>
${jsonObjects}
</json-objects>`;

  return { system, user };
};
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { buildSequentialPrompt } from "./SequentialExtractorPrompt";
|
|
3
|
-
import type { Artifact } from "../types";
|
|
4
|
-
|
|
5
|
-
const artifacts: Artifact[] = [
|
|
6
|
-
{
|
|
7
|
-
id: "a1",
|
|
8
|
-
type: "pdf",
|
|
9
|
-
raw: async () => Buffer.from(""),
|
|
10
|
-
contents: [{ page: 1, text: "Hello" }],
|
|
11
|
-
},
|
|
12
|
-
];
|
|
13
|
-
|
|
14
|
-
test("buildSequentialPrompt embeds previous data", () => {
|
|
15
|
-
const { system, user } = buildSequentialPrompt(
|
|
16
|
-
artifacts,
|
|
17
|
-
"{\"type\":\"object\"}",
|
|
18
|
-
"{\"existing\":true}"
|
|
19
|
-
);
|
|
20
|
-
|
|
21
|
-
expect(system).toContain("JSON schema");
|
|
22
|
-
expect(user).toContain("<previous-data>");
|
|
23
|
-
expect(user).toContain("existing");
|
|
24
|
-
});
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
import { formatArtifactsXml } from "./formatArtifacts";
|
|
2
|
-
import type { Artifact } from "../types";
|
|
3
|
-
|
|
4
|
-
const sequentialSystemPrompt = (schema: string, outputInstructions?: string) => {
|
|
5
|
-
return `<instructions>
|
|
6
|
-
You are a precise data extraction engine. Extract data from provided artifacts according to the JSON schema, enriching any previous data you receive.
|
|
7
|
-
|
|
8
|
-
<thinking>
|
|
9
|
-
Before extracting, consider:
|
|
10
|
-
1. Review previous data - what needs to be preserved vs enriched?
|
|
11
|
-
2. Which new fields have clear values in the artifacts?
|
|
12
|
-
3. Which fields remain missing or unclear (keep null from previous or set to null)?
|
|
13
|
-
4. Can new information improve the structure of existing data?
|
|
14
|
-
5. Ensure NO information is lost from previous data
|
|
15
|
-
</thinking>
|
|
16
|
-
|
|
17
|
-
<rules>
|
|
18
|
-
- Merge new artifacts into existing data - do not create fresh objects
|
|
19
|
-
- Preserve ALL previous data - losing information breaks the processing chain
|
|
20
|
-
- Use null for missing/uncertain values in new fields
|
|
21
|
-
- Only extract information explicitly present in the artifacts
|
|
22
|
-
- Output ONLY valid JSON matching the schema
|
|
23
|
-
- No markdown, explanations, or code fences
|
|
24
|
-
</rules>
|
|
25
|
-
|
|
26
|
-
<image-handling>
|
|
27
|
-
Some schema properties may reference artifact IDs (e.g., 'xxx_artifact_id' fields).
|
|
28
|
-
When assigning images to properties:
|
|
29
|
-
- Use format: artifact:ID/images/imageNUM.EXT (e.g., 'artifact:123456/images/image1.jpg')
|
|
30
|
-
- Only reference images you can actually see in the provided documents/images
|
|
31
|
-
- Image references are visible in artifact XML or written on images
|
|
32
|
-
- NEVER make up artifact IDs or use normal URLs
|
|
33
|
-
</image-handling>
|
|
34
|
-
|
|
35
|
-
<output-instructions>
|
|
36
|
-
${outputInstructions ?? "No additional output instructions provided."}
|
|
37
|
-
</output-instructions>
|
|
38
|
-
|
|
39
|
-
<json-schema>
|
|
40
|
-
${schema}
|
|
41
|
-
</json-schema>
|
|
42
|
-
|
|
43
|
-
<how-to-output>
|
|
44
|
-
Return the complete extracted data as valid JSON matching the schema.
|
|
45
|
-
Include all information from previous data, enriched with the new artifacts.
|
|
46
|
-
</how-to-output>
|
|
47
|
-
</instructions>`;
|
|
48
|
-
};
|
|
49
|
-
|
|
50
|
-
const sequentialUserPrompt = (
|
|
51
|
-
artifactsXml: string,
|
|
52
|
-
previousData: string,
|
|
53
|
-
outputInstructions?: string
|
|
54
|
-
) => {
|
|
55
|
-
return `${artifactsXml}
|
|
56
|
-
|
|
57
|
-
<previous-data>
|
|
58
|
-
${previousData}
|
|
59
|
-
</previous-data>
|
|
60
|
-
|
|
61
|
-
<task>
|
|
62
|
-
Extract the contents of the given artifacts and ADD/MERGE them into the previous data contained in the <previous-data> tag.
|
|
63
|
-
You MUST NOT lose any information from the previous data. All previous data must be included in your response.
|
|
64
|
-
</task>
|
|
65
|
-
|
|
66
|
-
<output-instructions>
|
|
67
|
-
${outputInstructions ?? ""}
|
|
68
|
-
</output-instructions>`;
|
|
69
|
-
};
|
|
70
|
-
|
|
71
|
-
/**
 * Assembles the system/user prompt pair for one sequential extraction step.
 *
 * @param artifacts - Artifacts to serialize with `formatArtifactsXml`.
 * @param schema - Schema text forwarded to the system prompt.
 * @param previousData - Data extracted so far, forwarded to the user prompt.
 * @param outputInstructions - Optional extra instructions, forwarded to both
 *   the system and the user prompt.
 * @returns An object with the `system` and `user` prompt strings.
 */
export const buildSequentialPrompt = (
  artifacts: Artifact[],
  schema: string,
  previousData: string,
  outputInstructions?: string
) => {
  // Serialize the artifacts first, then build each half of the prompt.
  const serializedArtifacts = formatArtifactsXml(artifacts);
  const system = sequentialSystemPrompt(schema, outputInstructions);
  const user = sequentialUserPrompt(serializedArtifacts, previousData, outputInstructions);

  return { system, user };
};
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { Artifact } from "../types";
|
|
3
|
-
import { formatArtifactsXml } from "./formatArtifacts";
|
|
4
|
-
|
|
5
|
-
// Verifies that formatArtifactsXml XML-escapes the artifact id and text, and
// that image references are either the image URL or a generated
// `artifact:ID/images/imageN.EXT` path.
//
// NOTE(review): the expected strings below restore the XML entities that the
// diff listing showed in decoded form; `&apos;` is assumed for the apostrophe
// (rather than `&#39;`) — confirm against the published source.
test("formatArtifactsXml escapes text and builds image refs", () => {
  const artifacts: Artifact[] = [
    {
      // Id made of exactly the five characters that must be escaped.
      id: "a&<>\"'",
      type: "text",
      raw: async () => Buffer.from(""),
      contents: [
        {
          page: 2,
          text: "Hello & <world> \"quote\" 'apostrophe'",
        },
        {
          page: 1,
          media: [
            // URL-backed image: the URL itself is used as the ref.
            { type: "image", url: "https://example.com/image.png" },
            // base64-backed image: generated ref with a `.png` extension.
            { type: "image", base64: "abc" },
            // Raw-buffer image: generated ref with a `.bin` extension.
            { type: "image", contents: Buffer.from([1, 2, 3]) },
          ],
        },
      ],
    },
  ];

  const result = formatArtifactsXml(artifacts);
  const expected = [
    "<artifact id=\"a&amp;&lt;&gt;&quot;&apos;\" type=\"text\">",
    " <text page=\"2\">Hello &amp; &lt;world&gt; &quot;quote&quot; &apos;apostrophe&apos;</text>",
    " <image ref=\"https://example.com/image.png\" page=\"1\" />",
    " <image ref=\"artifact:a&amp;&lt;&gt;&quot;&apos;/images/image2.png\" page=\"1\" />",
    " <image ref=\"artifact:a&amp;&lt;&gt;&quot;&apos;/images/image3.bin\" page=\"1\" />",
    "</artifact>",
  ].join("\n");

  expect(result).toBe(expected);
});
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import type { Artifact, ArtifactImage } from "../types";
|
|
2
|
-
|
|
3
|
-
/**
 * Produces the reference string used for an image inside artifact markup.
 *
 * @param artifactId - Id of the artifact that owns the image.
 * @param index - Zero-based position of the image; the generated name is
 *   one-based (`image1`, `image2`, ...).
 * @param image - The image; a truthy `url` is returned as-is, otherwise a
 *   generated `artifact:` path is returned.
 * @returns The image URL, or `artifact:<id>/images/image<N>.<ext>` where the
 *   extension is `png` when `base64` is truthy and `bin` otherwise.
 */
const imageRefFor = (artifactId: string, index: number, image: ArtifactImage) => {
  if (!image.url) {
    const ordinal = index + 1;
    const extension = image.base64 ? "png" : "bin";
    return `artifact:${artifactId}/images/image${ordinal}.${extension}`;
  }

  return image.url;
};
|
|
11
|
-
|
|
12
|
-
/**
 * Escapes the five characters that are special in XML text and attribute
 * values: `&`, `<`, `>`, `"` and `'`.
 *
 * NOTE(review): the diff listing showed the replacement strings with their
 * entities already decoded (which made the function a no-op and the `"`
 * replacement a syntax error). The predefined XML entities are restored here;
 * `&apos;` is assumed for the apostrophe — confirm against the published source.
 *
 * @param value - Raw text to escape.
 * @returns The text with each special character replaced by its entity.
 */
const escapeXml = (value: string) => {
  return value
    // `&` must be handled first so the entities produced below are not re-escaped.
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
};
|
|
20
|
-
|
|
21
|
-
/**
 * Serializes artifacts into the XML-like markup embedded in prompts.
 *
 * Each artifact becomes an `<artifact id type>` element. Inside it, every
 * content entry with truthy `text` yields a `<text>` element and every media
 * entry yields a self-closing `<image ref>` element; both carry a `page`
 * attribute when the content entry defines `page`. The artifact id, text and
 * image refs go through `escapeXml`; `type` and `page` are interpolated as-is.
 *
 * @param artifacts - Artifacts to serialize, in output order.
 * @returns The generated lines joined with newlines (empty string for no artifacts).
 */
export const formatArtifactsXml = (artifacts: Artifact[]) => {
  const lines: string[] = [];

  for (const artifact of artifacts) {
    lines.push(`<artifact id="${escapeXml(artifact.id)}" type="${artifact.type}">`);

    for (const content of artifact.contents) {
      // The same page attribute is shared by the text and image elements of this entry.
      const pageAttr = content.page === undefined ? "" : ` page="${content.page}"`;

      if (content.text) {
        lines.push(` <text${pageAttr}>${escapeXml(content.text)}</text>`);
      }

      if (content.media?.length) {
        // The index passed to imageRefFor is the position within this entry's media list.
        for (const [position, image] of content.media.entries()) {
          const ref = imageRefFor(artifact.id, position, image);
          lines.push(` <image ref="${escapeXml(ref)}"${pageAttr} />`);
        }
      }
    }

    lines.push("</artifact>");
  }

  return lines.join("\n");
};
|
package/src/strategies/AGENTS.md
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
Strategies module
|
|
2
|
-
|
|
3
|
-
- Purpose: orchestrate extraction flows (simple, parallel, sequential, auto-merge, double-pass).
|
|
4
|
-
- Key files: `SimpleStrategy.ts`, `ParallelStrategy.ts`, `SequentialStrategy.ts`, `ParallelAutoMergeStrategy.ts`, `SequentialAutoMergeStrategy.ts`, `DoublePassStrategy.ts`, `DoublePassAutoMergeStrategy.ts`, `utils.ts`, `concurrency.ts`.
|
|
5
|
-
- Design: strategies own config (chunk size, concurrency, models) and call prompt + retry helpers. Strategies emit `events.onStep` updates and implement `getEstimatedSteps` for progress tracking.
|
|
6
|
-
- Tests: strategy-specific `*.test.ts` files.
|