@struktur/sdk 1.2.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -81,6 +81,19 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
81
81
 
82
82
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
83
83
  const debug = options.debug;
84
+ const { telemetry } = options;
85
+
86
+ // Create strategy-level span
87
+ const strategySpan = telemetry?.startSpan({
88
+ name: "strategy.sequential-auto-merge",
89
+ kind: "CHAIN",
90
+ attributes: {
91
+ "strategy.name": this.name,
92
+ "strategy.artifacts.count": options.artifacts.length,
93
+ "strategy.chunk_size": this.config.chunkSize,
94
+ },
95
+ });
96
+
84
97
  const batches = getBatches(
85
98
  options.artifacts,
86
99
  {
@@ -88,6 +101,8 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
88
101
  maxImages: this.config.maxImages,
89
102
  },
90
103
  debug,
104
+ telemetry ?? undefined,
105
+ strategySpan,
91
106
  );
92
107
 
93
108
  const schema = serializeSchema(options.schema);
@@ -104,6 +119,17 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
104
119
  inputCount: batches.length,
105
120
  strategy: this.name,
106
121
  });
122
+
123
+ // Create smart merge span
124
+ const mergeSpan = telemetry?.startSpan({
125
+ name: "struktur.smart_merge",
126
+ kind: "CHAIN",
127
+ parentSpan: strategySpan,
128
+ attributes: {
129
+ "merge.strategy": "smart",
130
+ "merge.input_count": batches.length,
131
+ },
132
+ });
107
133
 
108
134
  for (const [index, batch] of batches.entries()) {
109
135
  const prompt = buildExtractorPrompt(
@@ -122,6 +148,8 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
122
148
  strict: options.strict ?? this.config.strict,
123
149
  debug,
124
150
  callId: `sequential_auto_batch_${index + 1}`,
151
+ telemetry: telemetry ?? undefined,
152
+ parentSpan: mergeSpan,
125
153
  });
126
154
 
127
155
  merged = merger.merge(merged, result.data as Record<string, unknown>);
@@ -145,6 +173,16 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
145
173
  leftCount: leftArray,
146
174
  rightCount: rightArray,
147
175
  });
176
+
177
+ // Record merge event in telemetry
178
+ if (mergeSpan && telemetry) {
179
+ telemetry.recordEvent(mergeSpan, {
180
+ type: "merge",
181
+ strategy: "smart",
182
+ inputCount: rightArray ?? 1,
183
+ outputCount: leftArray ?? 1,
184
+ });
185
+ }
148
186
  }
149
187
 
150
188
  step += 1;
@@ -162,8 +200,40 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
162
200
  }
163
201
 
164
202
  debug?.mergeComplete({ mergeId: "sequential_auto_merge", success: true });
203
+
204
+ // End merge span
205
+ if (mergeSpan && telemetry) {
206
+ telemetry.endSpan(mergeSpan, {
207
+ status: "ok",
208
+ output: merged,
209
+ });
210
+ }
165
211
 
166
212
  merged = dedupeArrays(merged);
213
+
214
+ // Create exact dedupe span
215
+ const exactDedupeSpan = telemetry?.startSpan({
216
+ name: "struktur.exact_dedupe",
217
+ kind: "CHAIN",
218
+ parentSpan: strategySpan,
219
+ attributes: {
220
+ "dedupe.method": "exact_hashing",
221
+ },
222
+ });
223
+
224
+ // End exact dedupe span
225
+ if (exactDedupeSpan && telemetry) {
226
+ telemetry.recordEvent(exactDedupeSpan, {
227
+ type: "merge",
228
+ strategy: "exact_hash_dedupe",
229
+ inputCount: Object.keys(merged).length,
230
+ outputCount: Object.keys(merged).length,
231
+ });
232
+ telemetry.endSpan(exactDedupeSpan, {
233
+ status: "ok",
234
+ output: merged,
235
+ });
236
+ }
167
237
 
168
238
  const dedupePrompt = buildDeduplicationPrompt(schema, merged);
169
239
 
@@ -171,6 +241,16 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
171
241
  dedupeId: "sequential_auto_dedupe",
172
242
  itemCount: Object.keys(merged).length,
173
243
  });
244
+
245
+ // Create LLM dedupe span
246
+ const llmDedupeSpan = telemetry?.startSpan({
247
+ name: "struktur.llm_dedupe",
248
+ kind: "CHAIN",
249
+ parentSpan: strategySpan,
250
+ attributes: {
251
+ "dedupe.method": "llm",
252
+ },
253
+ });
174
254
 
175
255
  const dedupeResponse = await runWithRetries<{ keys: string[] }>({
176
256
  model: this.config.dedupeModel ?? this.config.model,
@@ -182,6 +262,8 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
182
262
  strict: this.config.strict,
183
263
  debug,
184
264
  callId: "sequential_auto_dedupe",
265
+ telemetry: telemetry ?? undefined,
266
+ parentSpan: llmDedupeSpan,
185
267
  });
186
268
 
187
269
  step += 1;
@@ -207,6 +289,27 @@ export class SequentialAutoMergeStrategy<T> implements ExtractionStrategy<T> {
207
289
  duplicatesFound: dedupeResponse.data.keys.length,
208
290
  itemsRemoved: dedupeResponse.data.keys.length,
209
291
  });
292
+
293
+ // End LLM dedupe span
294
+ if (llmDedupeSpan && telemetry) {
295
+ telemetry.recordEvent(llmDedupeSpan, {
296
+ type: "merge",
297
+ strategy: "llm_dedupe",
298
+ inputCount: Object.keys(merged).length,
299
+ outputCount: Object.keys(deduped).length,
300
+ deduped: dedupeResponse.data.keys.length,
301
+ });
302
+ telemetry.endSpan(llmDedupeSpan, {
303
+ status: "ok",
304
+ output: deduped,
305
+ });
306
+ }
307
+
308
+ // End strategy span
309
+ telemetry?.endSpan(strategySpan!, {
310
+ status: "ok",
311
+ output: deduped,
312
+ });
210
313
 
211
314
  return {
212
315
  data: deduped as T,
@@ -36,6 +36,19 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
36
36
 
37
37
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
38
38
  const debug = options.debug;
39
+ const { telemetry } = options;
40
+
41
+ // Create strategy-level span
42
+ const strategySpan = telemetry?.startSpan({
43
+ name: "strategy.sequential",
44
+ kind: "CHAIN",
45
+ attributes: {
46
+ "strategy.name": this.name,
47
+ "strategy.artifacts.count": options.artifacts.length,
48
+ "strategy.chunk_size": this.config.chunkSize,
49
+ },
50
+ });
51
+
39
52
  const batches = getBatches(
40
53
  options.artifacts,
41
54
  {
@@ -43,6 +56,8 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
43
56
  maxImages: this.config.maxImages,
44
57
  },
45
58
  debug,
59
+ telemetry ?? undefined,
60
+ strategySpan,
46
61
  );
47
62
 
48
63
  const schema = serializeSchema(options.schema);
@@ -84,6 +99,8 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
84
99
  strict: options.strict ?? this.config.strict,
85
100
  debug,
86
101
  callId: `sequential_batch_${index + 1}`,
102
+ telemetry: telemetry ?? undefined,
103
+ parentSpan: strategySpan,
87
104
  });
88
105
 
89
106
  currentData = result.data;
@@ -110,6 +127,12 @@ export class SequentialStrategy<T> implements ExtractionStrategy<T> {
110
127
  throw new Error("No data extracted from sequential strategy");
111
128
  }
112
129
 
130
+ // End strategy span
131
+ telemetry?.endSpan(strategySpan!, {
132
+ status: "ok",
133
+ output: currentData,
134
+ });
135
+
113
136
  return { data: currentData, usage: mergeUsage(usages) };
114
137
  }
115
138
  }
@@ -25,6 +25,18 @@ export class SimpleStrategy<T> implements ExtractionStrategy<T> {
25
25
 
26
26
  async run(options: ExtractionOptions<T>): Promise<ExtractionResult<T>> {
27
27
  const debug = options.debug;
28
+ const { telemetry } = options;
29
+
30
+ // Create strategy-level span
31
+ const strategySpan = telemetry?.startSpan({
32
+ name: "strategy.simple",
33
+ kind: "CHAIN",
34
+ attributes: {
35
+ "strategy.name": this.name,
36
+ "strategy.artifacts.count": options.artifacts.length,
37
+ },
38
+ });
39
+
28
40
  const schema = serializeSchema(options.schema);
29
41
  const { system, user } = buildExtractorPrompt(
30
42
  options.artifacts,
@@ -56,6 +68,8 @@ export class SimpleStrategy<T> implements ExtractionStrategy<T> {
56
68
  strict: options.strict ?? this.config.strict,
57
69
  debug,
58
70
  callId: "simple_extract",
71
+ telemetry,
72
+ parentSpan: strategySpan,
59
73
  });
60
74
 
61
75
  debug?.step({
@@ -65,6 +79,12 @@ export class SimpleStrategy<T> implements ExtractionStrategy<T> {
65
79
  strategy: this.name,
66
80
  });
67
81
 
82
+ // End strategy span
83
+ telemetry?.endSpan(strategySpan!, {
84
+ status: "ok",
85
+ output: result.data,
86
+ });
87
+
68
88
  return { data: result.data, usage: result.usage };
69
89
  }
70
90
  }
@@ -1,4 +1,4 @@
1
- import type { Artifact, ExtractionEvents, Usage } from "../types";
1
+ import type { Artifact, ExtractionEvents, Usage, TelemetryAdapter } from "../types";
2
2
  import type { DebugLogger } from "../debug/logger";
3
3
  import { batchArtifacts, type BatchOptions } from "../chunking/ArtifactBatcher";
4
4
  import { buildUserContent } from "../llm/message";
@@ -22,9 +22,44 @@ export const mergeUsage = (usages: Usage[]) => {
22
22
  export const getBatches = (
23
23
  artifacts: Artifact[],
24
24
  options: BatchOptions,
25
- debug?: DebugLogger
25
+ debug?: DebugLogger,
26
+ telemetry?: TelemetryAdapter,
27
+ parentSpan?: { id: string; traceId: string; name: string; kind: string; startTime: number; parentId?: string }
26
28
  ) => {
27
- return batchArtifacts(artifacts, { ...options, debug });
29
+ // Create chunking span if telemetry is enabled
30
+ const chunkingSpan = telemetry?.startSpan({
31
+ name: "struktur.chunking",
32
+ kind: "RETRIEVER",
33
+ parentSpan,
34
+ attributes: {
35
+ "chunking.artifact_count": artifacts.length,
36
+ "chunking.max_tokens": options.maxTokens,
37
+ "chunking.max_images": options.maxImages,
38
+ },
39
+ });
40
+
41
+ const batches = batchArtifacts(artifacts, { ...options, debug });
42
+
43
+ // Record chunking results
44
+ if (chunkingSpan && telemetry) {
45
+ batches.forEach((batch, index) => {
46
+ telemetry.recordEvent(chunkingSpan, {
47
+ type: "chunk",
48
+ chunkIndex: index,
49
+ totalChunks: batches.length,
50
+ tokens: batch.reduce((sum, a) => sum + (a.tokens || 0), 0),
51
+ images: batch.reduce((sum, a) =>
52
+ sum + (a.contents?.flatMap((c) => c.media || []).length || 0), 0),
53
+ });
54
+ });
55
+
56
+ telemetry.endSpan(chunkingSpan, {
57
+ status: "ok",
58
+ output: { batchCount: batches.length },
59
+ });
60
+ }
61
+
62
+ return batches;
28
63
  };
29
64
 
30
65
  export const extractWithPrompt = async <T>(options: {
@@ -38,6 +73,8 @@ export const extractWithPrompt = async <T>(options: {
38
73
  strict?: boolean;
39
74
  debug?: DebugLogger;
40
75
  callId?: string;
76
+ telemetry?: TelemetryAdapter;
77
+ parentSpan?: { id: string; traceId: string; name: string; kind: string; startTime: number; parentId?: string };
41
78
  }) => {
42
79
  const userContent = buildUserContent(options.user, options.artifacts);
43
80
  const result = await runWithRetries<T>({
@@ -50,6 +87,8 @@ export const extractWithPrompt = async <T>(options: {
50
87
  strict: options.strict,
51
88
  debug: options.debug,
52
89
  callId: options.callId,
90
+ telemetry: options.telemetry,
91
+ parentSpan: options.parentSpan,
53
92
  });
54
93
 
55
94
  return result;
package/src/types.ts CHANGED
@@ -45,10 +45,34 @@ export type ExtractionResult<T> = {
45
45
  error?: Error;
46
46
  };
47
47
 
48
+ /**
49
+ * Telemetry adapter interface for tracing extraction operations.
50
+ * This is a minimal interface that matches the full TelemetryAdapter from @struktur/telemetry.
51
+ * SDK users should import adapters from @struktur/telemetry package.
52
+ */
53
+ export interface TelemetryAdapter {
54
+ readonly name: string;
55
+ readonly version: string;
56
+ initialize(): Promise<void>;
57
+ shutdown(): Promise<void>;
58
+ startSpan(context: {
59
+ name: string;
60
+ kind: "CHAIN" | "LLM" | "TOOL" | "AGENT" | "RETRIEVER" | "EMBEDDING" | "RERANKER";
61
+ parentSpan?: { id: string; traceId: string };
62
+ attributes?: Record<string, unknown>;
63
+ startTime?: number;
64
+ }): { id: string; traceId: string; name: string; kind: string; startTime: number; parentId?: string };
65
+ endSpan(span: { id: string }, result?: { status: "ok" | "error"; error?: Error; output?: unknown; latencyMs?: number }): void;
66
+ recordEvent(span: { id: string }, event: unknown): void;
67
+ setAttributes(span: { id: string }, attributes: Record<string, unknown>): void;
68
+ setContext(context: { sessionId?: string; userId?: string; metadata?: Record<string, unknown>; tags?: string[] }): void;
69
+ }
70
+
48
71
  export type StepInfo = {
49
72
  step: number;
50
73
  total?: number;
51
74
  label?: string;
75
+ detail?: string;
52
76
  };
53
77
 
54
78
  export type ProgressInfo = {
@@ -67,19 +91,47 @@ export type TokenUsageInfo = Usage & {
67
91
  };
68
92
 
69
93
  export type RetryInfo = {
70
- attempt: number;
71
- maxAttempts: number;
72
- reason?: string;
94
+ attempt: number;
95
+ maxAttempts: number;
96
+ reason?: string;
73
97
  };
74
98
 
75
- export type ExtractionEvents = {
76
- onStep?: (info: StepInfo) => void | Promise<void>;
77
- onMessage?: (info: MessageInfo) => void | Promise<void>;
78
- onProgress?: (info: ProgressInfo) => void | Promise<void>;
79
- onTokenUsage?: (info: TokenUsageInfo) => void | Promise<void>;
80
- onRetry?: (info: RetryInfo) => void | Promise<void>;
99
+ export type AgentToolStartInfo = {
100
+ toolName: string;
101
+ toolCallId: string;
102
+ args: Record<string, unknown>;
103
+ };
104
+
105
+ export type AgentToolEndInfo = {
106
+ toolCallId: string;
107
+ result?: Record<string, unknown>;
108
+ error?: string;
109
+ };
110
+
111
+ export type AgentMessageInfo = {
112
+ content: string;
113
+ role?: "assistant" | "user";
81
114
  };
82
115
 
116
+ export type AgentReasoningInfo = {
117
+ thought: string;
118
+ };
119
+
120
+ export type AgentEvents = {
121
+ onAgentToolStart?: (info: AgentToolStartInfo) => void | Promise<void>;
122
+ onAgentToolEnd?: (info: AgentToolEndInfo) => void | Promise<void>;
123
+ onAgentMessage?: (info: AgentMessageInfo) => void | Promise<void>;
124
+ onAgentReasoning?: (info: AgentReasoningInfo) => void | Promise<void>;
125
+ };
126
+
127
+ export type ExtractionEvents = {
128
+ onStep?: (info: StepInfo) => void | Promise<void>;
129
+ onMessage?: (info: MessageInfo) => void | Promise<void>;
130
+ onProgress?: (info: ProgressInfo) => void | Promise<void>;
131
+ onTokenUsage?: (info: TokenUsageInfo) => void | Promise<void>;
132
+ onRetry?: (info: RetryInfo) => void | Promise<void>;
133
+ } & AgentEvents;
134
+
83
135
  export type AnyJSONSchema = Record<string, unknown>;
84
136
  export type TypedJSONSchema<T> = JSONSchemaType<T>;
85
137
 
@@ -107,6 +159,12 @@ export type ExtractionOptions<T> = {
107
159
  events?: ExtractionEvents;
108
160
  debug?: DebugLogger;
109
161
  strict?: boolean;
162
+ /**
163
+ * Telemetry adapter for tracing extraction operations.
164
+ * Supports Phoenix (Arize), Langfuse, and other OpenTelemetry-compatible providers.
165
+ * Import from `@struktur/telemetry` package and pass the adapter here.
166
+ */
167
+ telemetry?: TelemetryAdapter | null;
110
168
  }
111
169
 
112
170
  export interface ExtractionStrategy<T> {
@@ -1,6 +1,7 @@
1
1
  Validation module
2
2
 
3
- - Purpose: Ajv schema validation and error shaping.
3
+ - Purpose: Schema validation and error shaping.
4
4
  - Key files: `validator.ts`.
5
- - Design: `validateOrThrow` compiles schemas and throws `SchemaValidationError` on failure; `createAjv` registers `ajv-formats` for common schema formats.
5
+ - Design: `validateOrThrow` compiles schemas and throws `SchemaValidationError` on failure; `createAjv` registers `ajv-formats` for common schema formats and adds a custom `artifact-id` format for referencing images in artifacts.
6
+ - Custom formats: `artifact-id` validates strings matching the pattern `artifact:ID/images/imageNUM.EXT` (e.g., `artifact:123456/images/image1.jpg`).
6
7
  - Tests: `validator.test.ts`.
@@ -82,6 +82,38 @@ test("createAjv supports common formats", () => {
82
82
  }
83
83
  });
84
84
 
85
// Verifies the custom `artifact-id` format registered by createAjv:
// well-formed ids pass through unchanged, malformed values are rejected.
test("createAjv supports artifact-id format", () => {
  const ajv = createAjv();
  const schema: JSONSchemaType<string> = { type: "string", format: "artifact-id" };

  const accepted = [
    "artifact:123456/images/image1.jpg",
    "artifact:abc-xyz/images/image10.png",
  ];
  for (const id of accepted) {
    expect(validateOrThrow<string>(ajv, schema, id)).toBe(id);
  }

  // Asserting on the thrown type directly avoids the try/throw/catch pattern,
  // where the sentinel "Expected validation error" is swallowed by its own
  // catch and reported as a confusing instance-type mismatch.
  const rejected = [
    "not-an-artifact-id", // no artifact: prefix
    "artifact:123/images/image", // missing image number and extension
    "https://example.com/image.jpg", // plain URL, not an artifact reference
  ];
  for (const value of rejected) {
    expect(() => validateOrThrow<string>(ajv, schema, value)).toThrow(SchemaValidationError);
  }
});
116
+
85
117
  test("isRequiredError identifies required constraint violations", () => {
86
118
  const requiredError = {
87
119
  keyword: "required",
@@ -19,6 +19,8 @@ export class SchemaValidationError extends Error {
19
19
  }
20
20
  }
21
21
 
22
+ const ARTIFACT_ID_PATTERN = /^artifact:[^/]+\/images\/image\d+\.\w+$/;
23
+
22
24
  export const createAjv = () => {
23
25
  const ajv = new Ajv({
24
26
  allErrors: true,
@@ -26,6 +28,12 @@ export const createAjv = () => {
26
28
  allowUnionTypes: true,
27
29
  });
28
30
  addFormats(ajv);
31
+
32
+ ajv.addFormat("artifact-id", {
33
+ type: "string",
34
+ validate: (data: string) => ARTIFACT_ID_PATTERN.test(data),
35
+ });
36
+
29
37
  return ajv;
30
38
  };
31
39