@struktur/sdk 2.1.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/artifacts/fileToArtifact.d.ts +8 -0
- package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
- package/dist/artifacts/input.d.ts +60 -0
- package/dist/artifacts/input.d.ts.map +1 -0
- package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
- package/dist/artifacts/providers.d.ts.map +1 -0
- package/dist/artifacts/urlToArtifact.d.ts +3 -0
- package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
- package/dist/auth/config.d.ts +34 -0
- package/dist/auth/config.d.ts.map +1 -0
- package/dist/auth/tokens.d.ts +18 -0
- package/dist/auth/tokens.d.ts.map +1 -0
- package/dist/chunking/ArtifactBatcher.d.ts +11 -0
- package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
- package/dist/chunking/ArtifactSplitter.d.ts +10 -0
- package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
- package/dist/debug/logger.d.ts +169 -0
- package/dist/debug/logger.d.ts.map +1 -0
- package/dist/extract.d.ts +3 -0
- package/dist/extract.d.ts.map +1 -0
- package/dist/fields.d.ts +75 -0
- package/dist/fields.d.ts.map +1 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5603 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/LLMClient.d.ts +40 -0
- package/dist/llm/LLMClient.d.ts.map +1 -0
- package/dist/llm/RetryingRunner.d.ts +37 -0
- package/dist/llm/RetryingRunner.d.ts.map +1 -0
- package/dist/llm/message.d.ts +12 -0
- package/dist/llm/message.d.ts.map +1 -0
- package/dist/llm/models.d.ts +13 -0
- package/dist/llm/models.d.ts.map +1 -0
- package/dist/llm/resolveModel.d.ts +3 -0
- package/dist/llm/resolveModel.d.ts.map +1 -0
- package/dist/merge/Deduplicator.d.ts +4 -0
- package/dist/merge/Deduplicator.d.ts.map +1 -0
- package/dist/merge/SmartDataMerger.d.ts +7 -0
- package/dist/merge/SmartDataMerger.d.ts.map +1 -0
- package/dist/parsers/collect.d.ts +7 -0
- package/dist/parsers/collect.d.ts.map +1 -0
- package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/mime.d.ts +12 -0
- package/dist/parsers/mime.d.ts.map +1 -0
- package/dist/parsers/npm.d.ts +16 -0
- package/dist/parsers/npm.d.ts.map +1 -0
- package/dist/parsers/pdf.d.ts +36 -0
- package/dist/parsers/pdf.d.ts.map +1 -0
- package/dist/parsers/runner.d.ts +4 -0
- package/dist/parsers/runner.d.ts.map +1 -0
- package/dist/parsers/types.d.ts +27 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers.d.ts +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
- package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
- package/dist/prompts/ExtractorPrompt.d.ts +6 -0
- package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
- package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
- package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
- package/dist/prompts/formatArtifacts.d.ts +3 -0
- package/dist/prompts/formatArtifacts.d.ts.map +1 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
- package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/DoublePassStrategy.d.ts +22 -0
- package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
- package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/ParallelStrategy.d.ts +22 -0
- package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
- package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
- package/dist/strategies/SequentialStrategy.d.ts +20 -0
- package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
- package/dist/strategies/SimpleStrategy.d.ts +18 -0
- package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
- package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
- package/dist/strategies/agent/AgentTools.d.ts +55 -0
- package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
- package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
- package/dist/strategies/agent/index.d.ts +4 -0
- package/dist/strategies/agent/index.d.ts.map +1 -0
- package/dist/strategies/concurrency.d.ts +2 -0
- package/dist/strategies/concurrency.d.ts.map +1 -0
- package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
- package/dist/strategies/index.d.ts.map +1 -0
- package/dist/strategies/utils.d.ts +39 -0
- package/dist/strategies/utils.d.ts.map +1 -0
- package/dist/strategies.d.ts +1 -0
- package/dist/strategies.js +3930 -0
- package/dist/strategies.js.map +1 -0
- package/dist/tokenization.d.ts +11 -0
- package/dist/tokenization.d.ts.map +1 -0
- package/dist/types.d.ts +178 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/validation/validator.d.ts +20 -0
- package/dist/validation/validator.d.ts.map +1 -0
- package/package.json +30 -14
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import * as strategies from "./index";
|
|
3
|
-
|
|
4
|
-
test("strategies index re-exports constructors and helpers", () => {
|
|
5
|
-
expect(typeof strategies.SimpleStrategy).toBe("function");
|
|
6
|
-
expect(typeof strategies.ParallelStrategy).toBe("function");
|
|
7
|
-
expect(typeof strategies.SequentialStrategy).toBe("function");
|
|
8
|
-
expect(typeof strategies.ParallelAutoMergeStrategy).toBe("function");
|
|
9
|
-
expect(typeof strategies.SequentialAutoMergeStrategy).toBe("function");
|
|
10
|
-
expect(typeof strategies.DoublePassStrategy).toBe("function");
|
|
11
|
-
expect(typeof strategies.DoublePassAutoMergeStrategy).toBe("function");
|
|
12
|
-
|
|
13
|
-
expect(typeof strategies.simple).toBe("function");
|
|
14
|
-
expect(typeof strategies.parallel).toBe("function");
|
|
15
|
-
expect(typeof strategies.sequential).toBe("function");
|
|
16
|
-
expect(typeof strategies.parallelAutoMerge).toBe("function");
|
|
17
|
-
expect(typeof strategies.sequentialAutoMerge).toBe("function");
|
|
18
|
-
expect(typeof strategies.doublePass).toBe("function");
|
|
19
|
-
expect(typeof strategies.doublePassAutoMerge).toBe("function");
|
|
20
|
-
});
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { Artifact } from "../types";
|
|
3
|
-
import type { JSONSchemaType } from "ajv";
|
|
4
|
-
import { batchArtifacts } from "../chunking/ArtifactBatcher";
|
|
5
|
-
import { serializeSchema, mergeUsage, getBatches, extractWithPrompt } from "./utils";
|
|
6
|
-
|
|
7
|
-
type Output = { title: string };
|
|
8
|
-
|
|
9
|
-
const schema: JSONSchemaType<Output> = {
|
|
10
|
-
type: "object",
|
|
11
|
-
properties: { title: { type: "string" } },
|
|
12
|
-
required: ["title"],
|
|
13
|
-
additionalProperties: false,
|
|
14
|
-
};
|
|
15
|
-
|
|
16
|
-
const makeArtifact = (id: string, text: string): Artifact => ({
|
|
17
|
-
id,
|
|
18
|
-
type: "text",
|
|
19
|
-
raw: async () => Buffer.from(text),
|
|
20
|
-
contents: [{ text }],
|
|
21
|
-
});
|
|
22
|
-
|
|
23
|
-
test("serializeSchema returns JSON", () => {
|
|
24
|
-
expect(serializeSchema({ ok: true })).toBe("{\"ok\":true}");
|
|
25
|
-
});
|
|
26
|
-
|
|
27
|
-
test("mergeUsage sums token usage", () => {
|
|
28
|
-
const usage = mergeUsage([
|
|
29
|
-
{ inputTokens: 1, outputTokens: 2, totalTokens: 3 },
|
|
30
|
-
{ inputTokens: 4, outputTokens: 5, totalTokens: 9 },
|
|
31
|
-
]);
|
|
32
|
-
|
|
33
|
-
expect(usage).toEqual({ inputTokens: 5, outputTokens: 7, totalTokens: 12 });
|
|
34
|
-
});
|
|
35
|
-
|
|
36
|
-
test("getBatches delegates to batchArtifacts", () => {
|
|
37
|
-
const artifacts = [makeArtifact("a1", "hello"), makeArtifact("a2", "world")];
|
|
38
|
-
const options = { maxTokens: 1 };
|
|
39
|
-
|
|
40
|
-
expect(getBatches(artifacts, options)).toEqual(batchArtifacts(artifacts, options));
|
|
41
|
-
});
|
|
42
|
-
|
|
43
|
-
test("extractWithPrompt builds user content and returns result", async () => {
|
|
44
|
-
const artifacts: Artifact[] = [
|
|
45
|
-
{
|
|
46
|
-
id: "a1",
|
|
47
|
-
type: "image",
|
|
48
|
-
raw: async () => Buffer.from(""),
|
|
49
|
-
contents: [
|
|
50
|
-
{
|
|
51
|
-
text: "hello",
|
|
52
|
-
media: [{ type: "image", base64: "abc" }],
|
|
53
|
-
},
|
|
54
|
-
],
|
|
55
|
-
},
|
|
56
|
-
];
|
|
57
|
-
|
|
58
|
-
let receivedUser: unknown;
|
|
59
|
-
const result = await extractWithPrompt<Output>({
|
|
60
|
-
model: {},
|
|
61
|
-
schema,
|
|
62
|
-
system: "sys",
|
|
63
|
-
user: "prompt",
|
|
64
|
-
artifacts,
|
|
65
|
-
execute: async (request) => {
|
|
66
|
-
receivedUser = request.user;
|
|
67
|
-
return {
|
|
68
|
-
data: { title: "ok" },
|
|
69
|
-
usage: { inputTokens: 1, outputTokens: 2, totalTokens: 3 },
|
|
70
|
-
};
|
|
71
|
-
},
|
|
72
|
-
});
|
|
73
|
-
|
|
74
|
-
expect(Array.isArray(receivedUser)).toBe(true);
|
|
75
|
-
expect(result.data.title).toBe("ok");
|
|
76
|
-
});
|
package/src/strategies/utils.ts
DELETED
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
import type { Artifact, ExtractionEvents, Usage, TelemetryAdapter } from "../types";
|
|
2
|
-
import type { DebugLogger } from "../debug/logger";
|
|
3
|
-
import { batchArtifacts, type BatchOptions } from "../chunking/ArtifactBatcher";
|
|
4
|
-
import { buildUserContent } from "../llm/message";
|
|
5
|
-
import { runWithRetries } from "../llm/RetryingRunner";
|
|
6
|
-
|
|
7
|
-
export const serializeSchema = (schema: unknown) => {
|
|
8
|
-
return JSON.stringify(schema);
|
|
9
|
-
};
|
|
10
|
-
|
|
11
|
-
export const mergeUsage = (usages: Usage[]) => {
|
|
12
|
-
return usages.reduce(
|
|
13
|
-
(acc, usage) => ({
|
|
14
|
-
inputTokens: acc.inputTokens + usage.inputTokens,
|
|
15
|
-
outputTokens: acc.outputTokens + usage.outputTokens,
|
|
16
|
-
totalTokens: acc.totalTokens + usage.totalTokens,
|
|
17
|
-
}),
|
|
18
|
-
{ inputTokens: 0, outputTokens: 0, totalTokens: 0 }
|
|
19
|
-
);
|
|
20
|
-
};
|
|
21
|
-
|
|
22
|
-
export const getBatches = (
|
|
23
|
-
artifacts: Artifact[],
|
|
24
|
-
options: BatchOptions,
|
|
25
|
-
debug?: DebugLogger,
|
|
26
|
-
telemetry?: TelemetryAdapter,
|
|
27
|
-
parentSpan?: { id: string; traceId: string; name: string; kind: string; startTime: number; parentId?: string }
|
|
28
|
-
) => {
|
|
29
|
-
// Create chunking span if telemetry is enabled
|
|
30
|
-
const chunkingSpan = telemetry?.startSpan({
|
|
31
|
-
name: "struktur.chunking",
|
|
32
|
-
kind: "RETRIEVER",
|
|
33
|
-
parentSpan,
|
|
34
|
-
attributes: {
|
|
35
|
-
"chunking.artifact_count": artifacts.length,
|
|
36
|
-
"chunking.max_tokens": options.maxTokens,
|
|
37
|
-
"chunking.max_images": options.maxImages,
|
|
38
|
-
},
|
|
39
|
-
});
|
|
40
|
-
|
|
41
|
-
const batches = batchArtifacts(artifacts, { ...options, debug });
|
|
42
|
-
|
|
43
|
-
// Record chunking results
|
|
44
|
-
if (chunkingSpan && telemetry) {
|
|
45
|
-
batches.forEach((batch, index) => {
|
|
46
|
-
telemetry.recordEvent(chunkingSpan, {
|
|
47
|
-
type: "chunk",
|
|
48
|
-
chunkIndex: index,
|
|
49
|
-
totalChunks: batches.length,
|
|
50
|
-
tokens: batch.reduce((sum, a) => sum + (a.tokens || 0), 0),
|
|
51
|
-
images: batch.reduce((sum, a) =>
|
|
52
|
-
sum + (a.contents?.flatMap((c) => c.media || []).length || 0), 0),
|
|
53
|
-
});
|
|
54
|
-
});
|
|
55
|
-
|
|
56
|
-
telemetry.endSpan(chunkingSpan, {
|
|
57
|
-
status: "ok",
|
|
58
|
-
output: { batchCount: batches.length },
|
|
59
|
-
});
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
return batches;
|
|
63
|
-
};
|
|
64
|
-
|
|
65
|
-
export const extractWithPrompt = async <T>(options: {
|
|
66
|
-
model: unknown;
|
|
67
|
-
schema: unknown;
|
|
68
|
-
system: string;
|
|
69
|
-
user: string;
|
|
70
|
-
artifacts: Artifact[];
|
|
71
|
-
events?: ExtractionEvents;
|
|
72
|
-
execute?: typeof runWithRetries<T>;
|
|
73
|
-
strict?: boolean;
|
|
74
|
-
debug?: DebugLogger;
|
|
75
|
-
callId?: string;
|
|
76
|
-
telemetry?: TelemetryAdapter;
|
|
77
|
-
parentSpan?: { id: string; traceId: string; name: string; kind: string; startTime: number; parentId?: string };
|
|
78
|
-
}) => {
|
|
79
|
-
const userContent = buildUserContent(options.user, options.artifacts);
|
|
80
|
-
const result = await runWithRetries<T>({
|
|
81
|
-
model: options.model,
|
|
82
|
-
schema: options.schema,
|
|
83
|
-
system: options.system,
|
|
84
|
-
user: userContent,
|
|
85
|
-
events: options.events,
|
|
86
|
-
execute: options.execute,
|
|
87
|
-
strict: options.strict,
|
|
88
|
-
debug: options.debug,
|
|
89
|
-
callId: options.callId,
|
|
90
|
-
telemetry: options.telemetry,
|
|
91
|
-
parentSpan: options.parentSpan,
|
|
92
|
-
});
|
|
93
|
-
|
|
94
|
-
return result;
|
|
95
|
-
};
|
package/src/tokenization.test.ts
DELETED
|
@@ -1,119 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { Artifact } from "./types";
|
|
3
|
-
import {
|
|
4
|
-
countArtifactTokens,
|
|
5
|
-
estimateTextTokens,
|
|
6
|
-
estimateImageTokens,
|
|
7
|
-
countContentTokens,
|
|
8
|
-
} from "./tokenization";
|
|
9
|
-
|
|
10
|
-
test("estimateTextTokens uses default ratio", () => {
|
|
11
|
-
expect(estimateTextTokens("abcd")).toBe(1);
|
|
12
|
-
expect(estimateTextTokens("abcdefgh")).toBe(2);
|
|
13
|
-
});
|
|
14
|
-
|
|
15
|
-
test("estimateTextTokens uses custom ratio", () => {
|
|
16
|
-
expect(estimateTextTokens("abcd", { textTokenRatio: 2 })).toBe(2);
|
|
17
|
-
expect(estimateTextTokens("abcdefgh", { textTokenRatio: 2 })).toBe(4);
|
|
18
|
-
});
|
|
19
|
-
|
|
20
|
-
test("estimateImageTokens uses default image tokens", () => {
|
|
21
|
-
expect(estimateImageTokens({ type: "image" })).toBe(1000);
|
|
22
|
-
});
|
|
23
|
-
|
|
24
|
-
test("estimateImageTokens uses custom image tokens", () => {
|
|
25
|
-
expect(estimateImageTokens({ type: "image" }, { defaultImageTokens: 500 })).toBe(500);
|
|
26
|
-
});
|
|
27
|
-
|
|
28
|
-
test("countArtifactTokens honors artifact.tokens override", () => {
|
|
29
|
-
const artifact: Artifact = {
|
|
30
|
-
id: "a1",
|
|
31
|
-
type: "text",
|
|
32
|
-
raw: async () => Buffer.from(""),
|
|
33
|
-
contents: [{ text: "hello" }],
|
|
34
|
-
tokens: 42,
|
|
35
|
-
};
|
|
36
|
-
|
|
37
|
-
expect(countArtifactTokens(artifact)).toBe(42);
|
|
38
|
-
});
|
|
39
|
-
|
|
40
|
-
test("countArtifactTokens sums text and media tokens", () => {
|
|
41
|
-
const artifact: Artifact = {
|
|
42
|
-
id: "a2",
|
|
43
|
-
type: "text",
|
|
44
|
-
raw: async () => Buffer.from(""),
|
|
45
|
-
contents: [
|
|
46
|
-
{
|
|
47
|
-
text: "abcdefgh",
|
|
48
|
-
media: [{ type: "image" }],
|
|
49
|
-
},
|
|
50
|
-
],
|
|
51
|
-
};
|
|
52
|
-
|
|
53
|
-
const tokens = countArtifactTokens(artifact);
|
|
54
|
-
expect(tokens).toBe(1002);
|
|
55
|
-
});
|
|
56
|
-
|
|
57
|
-
test("countContentTokens counts text tokens", () => {
|
|
58
|
-
const tokens = countContentTokens({ text: "abcdefgh" });
|
|
59
|
-
expect(tokens).toBe(2);
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
test("countContentTokens counts image tokens", () => {
|
|
63
|
-
const tokens = countContentTokens({ media: [{ type: "image" }] });
|
|
64
|
-
expect(tokens).toBe(1000);
|
|
65
|
-
});
|
|
66
|
-
|
|
67
|
-
test("countContentTokens counts image text tokens", () => {
|
|
68
|
-
const tokens = countContentTokens({
|
|
69
|
-
media: [{ type: "image", text: "abcd" }],
|
|
70
|
-
});
|
|
71
|
-
expect(tokens).toBe(1001);
|
|
72
|
-
});
|
|
73
|
-
|
|
74
|
-
test("countContentTokens sums multiple images", () => {
|
|
75
|
-
const tokens = countContentTokens({
|
|
76
|
-
media: [{ type: "image" }, { type: "image" }],
|
|
77
|
-
});
|
|
78
|
-
expect(tokens).toBe(2000);
|
|
79
|
-
});
|
|
80
|
-
|
|
81
|
-
test("countContentTokens handles empty content", () => {
|
|
82
|
-
const tokens = countContentTokens({});
|
|
83
|
-
expect(tokens).toBe(0);
|
|
84
|
-
});
|
|
85
|
-
|
|
86
|
-
test("countArtifactTokens sums multiple contents", () => {
|
|
87
|
-
const artifact: Artifact = {
|
|
88
|
-
id: "a1",
|
|
89
|
-
type: "text",
|
|
90
|
-
raw: async () => Buffer.from(""),
|
|
91
|
-
contents: [
|
|
92
|
-
{ text: "abcd" },
|
|
93
|
-
{ text: "efgh" },
|
|
94
|
-
{ media: [{ type: "image" }] },
|
|
95
|
-
],
|
|
96
|
-
};
|
|
97
|
-
|
|
98
|
-
const tokens = countArtifactTokens(artifact);
|
|
99
|
-
expect(tokens).toBe(1002);
|
|
100
|
-
});
|
|
101
|
-
|
|
102
|
-
test("countArtifactTokens with custom options", () => {
|
|
103
|
-
const artifact: Artifact = {
|
|
104
|
-
id: "a1",
|
|
105
|
-
type: "text",
|
|
106
|
-
raw: async () => Buffer.from(""),
|
|
107
|
-
contents: [
|
|
108
|
-
{ text: "abcdefgh" },
|
|
109
|
-
{ media: [{ type: "image" }] },
|
|
110
|
-
],
|
|
111
|
-
};
|
|
112
|
-
|
|
113
|
-
const tokens = countArtifactTokens(artifact, {
|
|
114
|
-
textTokenRatio: 2,
|
|
115
|
-
defaultImageTokens: 500,
|
|
116
|
-
});
|
|
117
|
-
|
|
118
|
-
expect(tokens).toBe(504);
|
|
119
|
-
});
|
package/src/tokenization.ts
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
import type { Artifact, ArtifactContent, ArtifactImage } from "./types";
|
|
2
|
-
|
|
3
|
-
export type TokenCountOptions = {
|
|
4
|
-
textTokenRatio?: number;
|
|
5
|
-
defaultImageTokens?: number;
|
|
6
|
-
};
|
|
7
|
-
|
|
8
|
-
const defaultOptions: Required<TokenCountOptions> = {
|
|
9
|
-
textTokenRatio: 4,
|
|
10
|
-
defaultImageTokens: 1000,
|
|
11
|
-
};
|
|
12
|
-
|
|
13
|
-
const mergeOptions = (options?: TokenCountOptions) => ({
|
|
14
|
-
...defaultOptions,
|
|
15
|
-
...(options ?? {}),
|
|
16
|
-
});
|
|
17
|
-
|
|
18
|
-
export const estimateTextTokens = (text: string, options?: TokenCountOptions) => {
|
|
19
|
-
const { textTokenRatio } = mergeOptions(options);
|
|
20
|
-
return Math.ceil(text.length / textTokenRatio);
|
|
21
|
-
};
|
|
22
|
-
|
|
23
|
-
export const estimateImageTokens = (
|
|
24
|
-
_image: ArtifactImage,
|
|
25
|
-
options?: TokenCountOptions
|
|
26
|
-
) => {
|
|
27
|
-
const { defaultImageTokens } = mergeOptions(options);
|
|
28
|
-
return defaultImageTokens;
|
|
29
|
-
};
|
|
30
|
-
|
|
31
|
-
export const countContentTokens = (
|
|
32
|
-
content: ArtifactContent,
|
|
33
|
-
options?: TokenCountOptions
|
|
34
|
-
) => {
|
|
35
|
-
let tokens = 0;
|
|
36
|
-
|
|
37
|
-
if (content.text) {
|
|
38
|
-
tokens += estimateTextTokens(content.text, options);
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
if (content.media?.length) {
|
|
42
|
-
for (const media of content.media) {
|
|
43
|
-
tokens += estimateImageTokens(media, options);
|
|
44
|
-
if (media.text) {
|
|
45
|
-
tokens += estimateTextTokens(media.text, options);
|
|
46
|
-
}
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
return tokens;
|
|
51
|
-
};
|
|
52
|
-
|
|
53
|
-
export const countArtifactTokens = (
|
|
54
|
-
artifact: Artifact,
|
|
55
|
-
options?: TokenCountOptions
|
|
56
|
-
) => {
|
|
57
|
-
if (typeof artifact.tokens === "number") {
|
|
58
|
-
return artifact.tokens;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
return artifact.contents.reduce(
|
|
62
|
-
(total, content) => total + countContentTokens(content, options),
|
|
63
|
-
0
|
|
64
|
-
);
|
|
65
|
-
};
|
|
66
|
-
|
|
67
|
-
export const countArtifactImages = (artifact: Artifact) => {
|
|
68
|
-
return artifact.contents.reduce((count, content) => {
|
|
69
|
-
return count + (content.media?.length ?? 0);
|
|
70
|
-
}, 0);
|
|
71
|
-
};
|
package/src/types.test.ts
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import type { Artifact, ExtractionResult, ExtractionStrategy, Usage } from "./types";
|
|
3
|
-
|
|
4
|
-
test("types can be used to build core DTOs", async () => {
|
|
5
|
-
const usage: Usage = { inputTokens: 1, outputTokens: 2, totalTokens: 3 };
|
|
6
|
-
const artifact: Artifact = {
|
|
7
|
-
id: "a1",
|
|
8
|
-
type: "text",
|
|
9
|
-
raw: async () => Buffer.from(""),
|
|
10
|
-
contents: [{ text: "hello" }],
|
|
11
|
-
};
|
|
12
|
-
|
|
13
|
-
const strategy: ExtractionStrategy<{ title: string }> = {
|
|
14
|
-
name: "test",
|
|
15
|
-
run: async () => ({ data: { title: "ok" }, usage }),
|
|
16
|
-
};
|
|
17
|
-
|
|
18
|
-
const result: ExtractionResult<{ title: string }> = await strategy.run({
|
|
19
|
-
artifacts: [artifact],
|
|
20
|
-
schema: { type: "object" },
|
|
21
|
-
strategy,
|
|
22
|
-
});
|
|
23
|
-
|
|
24
|
-
expect(result.data.title).toBe("ok");
|
|
25
|
-
});
|
package/src/types.ts
DELETED
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
import type { JSONSchemaType } from "ajv";
|
|
2
|
-
import type { DebugLogger } from "./debug/logger";
|
|
3
|
-
|
|
4
|
-
export type ArtifactType = "text" | "image" | "pdf" | "file";
|
|
5
|
-
|
|
6
|
-
export type ImageType = "embedded" | "screenshot";
|
|
7
|
-
|
|
8
|
-
export type ArtifactImage = {
|
|
9
|
-
type: "image";
|
|
10
|
-
url?: string;
|
|
11
|
-
base64?: string;
|
|
12
|
-
contents?: Buffer;
|
|
13
|
-
text?: string;
|
|
14
|
-
x?: number;
|
|
15
|
-
y?: number;
|
|
16
|
-
width?: number;
|
|
17
|
-
height?: number;
|
|
18
|
-
imageType?: ImageType;
|
|
19
|
-
};
|
|
20
|
-
|
|
21
|
-
export type ArtifactContent = {
|
|
22
|
-
page?: number;
|
|
23
|
-
text?: string;
|
|
24
|
-
media?: ArtifactImage[];
|
|
25
|
-
};
|
|
26
|
-
|
|
27
|
-
export interface Artifact {
|
|
28
|
-
id: string;
|
|
29
|
-
type: ArtifactType;
|
|
30
|
-
raw: () => Promise<Buffer>;
|
|
31
|
-
contents: ArtifactContent[];
|
|
32
|
-
metadata?: Record<string, unknown>;
|
|
33
|
-
tokens?: number;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
export type Usage = {
|
|
37
|
-
inputTokens: number;
|
|
38
|
-
outputTokens: number;
|
|
39
|
-
totalTokens: number;
|
|
40
|
-
};
|
|
41
|
-
|
|
42
|
-
export type ExtractionResult<T> = {
|
|
43
|
-
data: T;
|
|
44
|
-
usage: Usage;
|
|
45
|
-
error?: Error;
|
|
46
|
-
};
|
|
47
|
-
|
|
48
|
-
/**
|
|
49
|
-
* Telemetry adapter interface for tracing extraction operations.
|
|
50
|
-
* This is a minimal interface that matches the full TelemetryAdapter from @struktur/telemetry.
|
|
51
|
-
* SDK users should import adapters from @struktur/telemetry package.
|
|
52
|
-
*/
|
|
53
|
-
export interface TelemetryAdapter {
|
|
54
|
-
readonly name: string;
|
|
55
|
-
readonly version: string;
|
|
56
|
-
initialize(): Promise<void>;
|
|
57
|
-
shutdown(): Promise<void>;
|
|
58
|
-
startSpan(context: {
|
|
59
|
-
name: string;
|
|
60
|
-
kind: "CHAIN" | "LLM" | "TOOL" | "AGENT" | "RETRIEVER" | "EMBEDDING" | "RERANKER";
|
|
61
|
-
parentSpan?: { id: string; traceId: string };
|
|
62
|
-
attributes?: Record<string, unknown>;
|
|
63
|
-
startTime?: number;
|
|
64
|
-
}): { id: string; traceId: string; name: string; kind: string; startTime: number; parentId?: string };
|
|
65
|
-
endSpan(span: { id: string }, result?: { status: "ok" | "error"; error?: Error; output?: unknown; latencyMs?: number }): void;
|
|
66
|
-
recordEvent(span: { id: string }, event: unknown): void;
|
|
67
|
-
setAttributes(span: { id: string }, attributes: Record<string, unknown>): void;
|
|
68
|
-
setContext(context: { sessionId?: string; userId?: string; metadata?: Record<string, unknown>; tags?: string[] }): void;
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
export type StepInfo = {
|
|
72
|
-
step: number;
|
|
73
|
-
total?: number;
|
|
74
|
-
label?: string;
|
|
75
|
-
detail?: string;
|
|
76
|
-
};
|
|
77
|
-
|
|
78
|
-
export type ProgressInfo = {
|
|
79
|
-
current: number;
|
|
80
|
-
total: number;
|
|
81
|
-
percent?: number;
|
|
82
|
-
};
|
|
83
|
-
|
|
84
|
-
export type MessageInfo = {
|
|
85
|
-
role: "system" | "user" | "assistant" | "tool";
|
|
86
|
-
content: unknown;
|
|
87
|
-
};
|
|
88
|
-
|
|
89
|
-
export type TokenUsageInfo = Usage & {
|
|
90
|
-
model?: string;
|
|
91
|
-
};
|
|
92
|
-
|
|
93
|
-
export type RetryInfo = {
|
|
94
|
-
attempt: number;
|
|
95
|
-
maxAttempts: number;
|
|
96
|
-
reason?: string;
|
|
97
|
-
};
|
|
98
|
-
|
|
99
|
-
export type AgentToolStartInfo = {
|
|
100
|
-
toolName: string;
|
|
101
|
-
toolCallId: string;
|
|
102
|
-
args: Record<string, unknown>;
|
|
103
|
-
};
|
|
104
|
-
|
|
105
|
-
export type AgentToolEndInfo = {
|
|
106
|
-
toolCallId: string;
|
|
107
|
-
result?: Record<string, unknown>;
|
|
108
|
-
error?: string;
|
|
109
|
-
};
|
|
110
|
-
|
|
111
|
-
export type AgentMessageInfo = {
|
|
112
|
-
content: string;
|
|
113
|
-
role?: "assistant" | "user";
|
|
114
|
-
};
|
|
115
|
-
|
|
116
|
-
export type AgentReasoningInfo = {
|
|
117
|
-
thought: string;
|
|
118
|
-
};
|
|
119
|
-
|
|
120
|
-
export type AgentEvents = {
|
|
121
|
-
onAgentToolStart?: (info: AgentToolStartInfo) => void | Promise<void>;
|
|
122
|
-
onAgentToolEnd?: (info: AgentToolEndInfo) => void | Promise<void>;
|
|
123
|
-
onAgentMessage?: (info: AgentMessageInfo) => void | Promise<void>;
|
|
124
|
-
onAgentReasoning?: (info: AgentReasoningInfo) => void | Promise<void>;
|
|
125
|
-
};
|
|
126
|
-
|
|
127
|
-
export type ExtractionEvents = {
|
|
128
|
-
onStep?: (info: StepInfo) => void | Promise<void>;
|
|
129
|
-
onMessage?: (info: MessageInfo) => void | Promise<void>;
|
|
130
|
-
onProgress?: (info: ProgressInfo) => void | Promise<void>;
|
|
131
|
-
onTokenUsage?: (info: TokenUsageInfo) => void | Promise<void>;
|
|
132
|
-
onRetry?: (info: RetryInfo) => void | Promise<void>;
|
|
133
|
-
} & AgentEvents;
|
|
134
|
-
|
|
135
|
-
export type AnyJSONSchema = Record<string, unknown>;
|
|
136
|
-
export type TypedJSONSchema<T> = JSONSchemaType<T>;
|
|
137
|
-
|
|
138
|
-
export type ProviderModelsResult = {
|
|
139
|
-
provider: string;
|
|
140
|
-
ok: boolean;
|
|
141
|
-
models?: string[];
|
|
142
|
-
error?: string;
|
|
143
|
-
};
|
|
144
|
-
|
|
145
|
-
export type ExtractionOptions<T> = {
|
|
146
|
-
artifacts: Artifact[];
|
|
147
|
-
/**
|
|
148
|
-
* JSON Schema for the extracted output.
|
|
149
|
-
* Exactly one of `schema`, `fields`, or an inline schema via the CLI must be provided.
|
|
150
|
-
*/
|
|
151
|
-
schema?: TypedJSONSchema<T> | AnyJSONSchema;
|
|
152
|
-
/**
|
|
153
|
-
* Shorthand schema definition as a comma-separated string of `name` or `name:type` tokens.
|
|
154
|
-
* E.g. `"title, price:number"`. Defaults to `string` when no type is specified.
|
|
155
|
-
* Mutually exclusive with `schema`.
|
|
156
|
-
*/
|
|
157
|
-
fields?: string;
|
|
158
|
-
strategy: ExtractionStrategy<T>;
|
|
159
|
-
events?: ExtractionEvents;
|
|
160
|
-
debug?: DebugLogger;
|
|
161
|
-
strict?: boolean;
|
|
162
|
-
/**
|
|
163
|
-
* Telemetry adapter for tracing extraction operations.
|
|
164
|
-
* Supports Phoenix (Arize), Langfuse, and other OpenTelemetry-compatible providers.
|
|
165
|
-
* Import from `@struktur/telemetry` package and pass the adapter here.
|
|
166
|
-
*/
|
|
167
|
-
telemetry?: TelemetryAdapter | null;
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
export interface ExtractionStrategy<T> {
|
|
171
|
-
name: string;
|
|
172
|
-
run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>>;
|
|
173
|
-
getEstimatedSteps?: (artifacts: Artifact[]) => number;
|
|
174
|
-
}
|
package/src/validation/AGENTS.md
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
Validation module
|
|
2
|
-
|
|
3
|
-
- Purpose: Schema validation and error shaping.
|
|
4
|
-
- Key files: `validator.ts`.
|
|
5
|
-
- Design: `validateOrThrow` compiles schemas and throws `SchemaValidationError` on failure; `createAjv` registers `ajv-formats` for common schema formats and adds custom `artifact-id` format for referencing images in artifacts.
|
|
6
|
-
- Custom formats: `artifact-id` validates strings matching pattern `artifact:ID/images/imageNUM.EXT` (e.g., `artifact:123456/images/image1.jpg`).
|
|
7
|
-
- Tests: `validator.test.ts`.
|