npm - @struktur/sdk - Versions diffs - 2.1.2 → 2.2.0 - Mend

@struktur/sdk 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

package/dist/index.js +4111 -0
package/dist/index.js.map +1 -0
package/dist/parsers.js +492 -0
package/dist/parsers.js.map +1 -0
package/dist/strategies.js +2435 -0
package/dist/strategies.js.map +1 -0
package/package.json +24 -12
package/src/agent-cli-integration.test.ts +0 -47
package/src/agent-export.test.ts +0 -17
package/src/agent-tool-labels.test.ts +0 -50
package/src/artifacts/AGENTS.md +0 -16
package/src/artifacts/fileToArtifact.test.ts +0 -37
package/src/artifacts/fileToArtifact.ts +0 -44
package/src/artifacts/input.test.ts +0 -243
package/src/artifacts/input.ts +0 -360
package/src/artifacts/providers.test.ts +0 -19
package/src/artifacts/providers.ts +0 -7
package/src/artifacts/urlToArtifact.test.ts +0 -23
package/src/artifacts/urlToArtifact.ts +0 -19
package/src/auth/AGENTS.md +0 -11
package/src/auth/config.test.ts +0 -132
package/src/auth/config.ts +0 -186
package/src/auth/tokens.test.ts +0 -58
package/src/auth/tokens.ts +0 -229
package/src/chunking/AGENTS.md +0 -11
package/src/chunking/ArtifactBatcher.test.ts +0 -22
package/src/chunking/ArtifactBatcher.ts +0 -110
package/src/chunking/ArtifactSplitter.test.ts +0 -38
package/src/chunking/ArtifactSplitter.ts +0 -151
package/src/debug/AGENTS.md +0 -79
package/src/debug/logger.test.ts +0 -244
package/src/debug/logger.ts +0 -211
package/src/extract.test.ts +0 -22
package/src/extract.ts +0 -150
package/src/fields.test.ts +0 -681
package/src/fields.ts +0 -246
package/src/index.test.ts +0 -20
package/src/index.ts +0 -110
package/src/llm/AGENTS.md +0 -9
package/src/llm/LLMClient.test.ts +0 -394
package/src/llm/LLMClient.ts +0 -264
package/src/llm/RetryingRunner.test.ts +0 -174
package/src/llm/RetryingRunner.ts +0 -270
package/src/llm/message.test.ts +0 -42
package/src/llm/message.ts +0 -47
package/src/llm/models.test.ts +0 -82
package/src/llm/models.ts +0 -190
package/src/llm/resolveModel.ts +0 -86
package/src/merge/AGENTS.md +0 -6
package/src/merge/Deduplicator.test.ts +0 -108
package/src/merge/Deduplicator.ts +0 -45
package/src/merge/SmartDataMerger.test.ts +0 -177
package/src/merge/SmartDataMerger.ts +0 -56
package/src/parsers/AGENTS.md +0 -58
package/src/parsers/collect.test.ts +0 -56
package/src/parsers/collect.ts +0 -31
package/src/parsers/index.ts +0 -6
package/src/parsers/mime.test.ts +0 -91
package/src/parsers/mime.ts +0 -137
package/src/parsers/npm.ts +0 -26
package/src/parsers/pdf.test.ts +0 -394
package/src/parsers/pdf.ts +0 -194
package/src/parsers/runner.test.ts +0 -95
package/src/parsers/runner.ts +0 -177
package/src/parsers/types.ts +0 -29
package/src/prompts/AGENTS.md +0 -8
package/src/prompts/DeduplicationPrompt.test.ts +0 -41
package/src/prompts/DeduplicationPrompt.ts +0 -37
package/src/prompts/ExtractorPrompt.test.ts +0 -21
package/src/prompts/ExtractorPrompt.ts +0 -72
package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
package/src/prompts/ParallelMergerPrompt.ts +0 -37
package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
package/src/prompts/SequentialExtractorPrompt.ts +0 -82
package/src/prompts/formatArtifacts.test.ts +0 -39
package/src/prompts/formatArtifacts.ts +0 -46
package/src/strategies/AGENTS.md +0 -6
package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
package/src/strategies/DoublePassStrategy.test.ts +0 -48
package/src/strategies/DoublePassStrategy.ts +0 -266
package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
package/src/strategies/ParallelStrategy.test.ts +0 -61
package/src/strategies/ParallelStrategy.ts +0 -208
package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
package/src/strategies/SequentialStrategy.test.ts +0 -53
package/src/strategies/SequentialStrategy.ts +0 -142
package/src/strategies/SimpleStrategy.test.ts +0 -46
package/src/strategies/SimpleStrategy.ts +0 -94
package/src/strategies/concurrency.test.ts +0 -16
package/src/strategies/concurrency.ts +0 -14
package/src/strategies/index.test.ts +0 -20
package/src/strategies/index.ts +0 -7
package/src/strategies/utils.test.ts +0 -76
package/src/strategies/utils.ts +0 -95
package/src/tokenization.test.ts +0 -119
package/src/tokenization.ts +0 -71
package/src/types.test.ts +0 -25
package/src/types.ts +0 -174
package/src/validation/AGENTS.md +0 -7
package/src/validation/validator.test.ts +0 -204
package/src/validation/validator.ts +0 -90
package/tsconfig.json +0 -22

package/src/llm/RetryingRunner.test.ts DELETED Viewed

@@ -1,174 +0,0 @@
-import { test, expect } from "bun:test";
-import type { JSONSchemaType } from "ajv";
-import { runWithRetries } from "./RetryingRunner";
-type Output = { title: string };
-const schema: JSONSchemaType<Output> = {
-  type: "object",
-  properties: { title: { type: "string" } },
-  required: ["title"],
-  additionalProperties: false,
-};
-test("runWithRetries emits onRetry event when retrying", async () => {
-  let calls = 0;
-  const retryEvents: Array<{ attempt: number; maxAttempts: number; reason?: string }> = [];
-  const result = await runWithRetries<Output>({
-    model: {},
-    schema,
-    system: "sys",
-    user: "user",
-    execute: async () => {
-      calls += 1;
-      if (calls === 1) {
-        return {
-          data: { title: 123 } as unknown as Output,
-          usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-        };
-      }
-      return {
-        data: { title: "ok" },
-        usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-      };
-    },
-    events: {
-      onRetry: (info) => {
-        retryEvents.push(info);
-      },
-    },
-  });
-  expect(result.data.title).toBe("ok");
-  expect(calls).toBe(2);
-  expect(retryEvents).toHaveLength(1);
-  expect(retryEvents[0]?.attempt).toBe(2);
-  expect(retryEvents[0]?.maxAttempts).toBe(3);
-  expect(retryEvents[0]?.reason).toBe("schema_validation_failed");
-});
-test("runWithRetries retries on validation error", async () => {
-  let calls = 0;
-  const result = await runWithRetries<Output>({
-    model: {},
-    schema,
-    system: "sys",
-    user: "user",
-    execute: async () => {
-      calls += 1;
-      if (calls === 1) {
-        return {
-          data: { title: 123 } as unknown as Output,
-          usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-        };
-      }
-      return {
-        data: { title: "ok" },
-        usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-      };
-    },
-  });
-  expect(result.data.title).toBe("ok");
-  expect(calls).toBe(2);
-});
-test("runWithRetries with strict=false retries on missing required fields until max attempts", async () => {
-  let calls = 0;
-  await expect(
-    runWithRetries<Output>({
-      model: {},
-      schema,
-      system: "sys",
-      user: "user",
-      strict: false,
-      maxAttempts: 2,
-      execute: async () => {
-        calls += 1;
-        return {
-          data: {} as Output,
-          usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-        };
-      },
-    })
-  ).rejects.toThrow();
-  expect(calls).toBe(2);
-});
-test("runWithRetries with strict=true validates required fields on every attempt", async () => {
-  let calls = 0;
-  await expect(
-    runWithRetries<Output>({
-      model: {},
-      schema,
-      system: "sys",
-      user: "user",
-      strict: true,
-      maxAttempts: 2,
-      execute: async () => {
-        calls += 1;
-        return {
-          data: {} as Output,
-          usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-        };
-      },
-    })
-  ).rejects.toThrow();
-  expect(calls).toBe(2);
-});
-test("runWithRetries with strict=false still validates type errors", async () => {
-  let calls = 0;
-  const result = await runWithRetries<Output>({
-    model: {},
-    schema,
-    system: "sys",
-    user: "user",
-    strict: false,
-    execute: async () => {
-      calls += 1;
-      if (calls === 1) {
-        return {
-          data: { title: 123 } as unknown as Output,
-          usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-        };
-      }
-      return {
-        data: { title: "ok" },
-        usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-      };
-    },
-  });
-  expect(result.data.title).toBe("ok");
-  expect(calls).toBe(2);
-});
-test("runWithRetries enforces strict validation on final attempt even with strict=false", async () => {
-  let calls = 0;
-  await expect(
-    runWithRetries<Output>({
-      model: {},
-      schema,
-      system: "sys",
-      user: "user",
-      strict: false,
-      maxAttempts: 2,
-      execute: async () => {
-        calls += 1;
-        return {
-          data: {} as Output,
-          usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
-        };
-      },
-    })
-  ).rejects.toThrow();
-  expect(calls).toBe(2);
-});

package/src/llm/RetryingRunner.ts DELETED Viewed

@@ -1,270 +0,0 @@
-import {
-  createAjv,
-  validateOrThrow,
-  SchemaValidationError,
-  validateAllowingMissingRequired,
-} from "../validation/validator";
-import type { ModelMessage } from "ai";
-import type { ExtractionEvents, Usage, TelemetryAdapter } from "../types";
-import type { DebugLogger } from "../debug/logger";
-import { generateStructured } from "./LLMClient";
-import type { UserContent } from "./message";
-export type RetryOptions<T> = {
-  model: unknown;
-  schema: unknown;
-  system: string;
-  user: UserContent;
-  events?: ExtractionEvents;
-  maxAttempts?: number;
-  schemaName?: string;
-  execute?: typeof generateStructured<T>;
-  strict?: boolean;
-  debug?: DebugLogger;
-  callId?: string;
-  /**
-   * Telemetry adapter for tracing validation and retries
-   */
-  telemetry?: TelemetryAdapter;
-  /**
-   * Parent span for creating hierarchical traces
-   */
-  parentSpan?: { id: string; traceId: string; name: string; kind: string; startTime: number; parentId?: string };
-};
-export const runWithRetries = async <T>(options: RetryOptions<T>) => {
-  const { telemetry, parentSpan } = options;
-  // Start validation/retry span if telemetry is enabled
-  const retrySpan = telemetry?.startSpan({
-    name: "struktur.validation_retry",
-    kind: "CHAIN",
-    parentSpan,
-    attributes: {
-      "retry.max_attempts": options.maxAttempts ?? 3,
-      "retry.schema_name": options.schemaName ?? "extract",
-    },
-  });
-  const ajv = createAjv();
-  const maxAttempts = options.maxAttempts ?? 3;
-  const messages: ModelMessage[] = [{ role: "user", content: options.user }];
-  const debug = options.debug;
-  const callId =
-    options.callId ??
-    `call_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
-  let usage: Usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
-  let lastError: Error | undefined;
-  // Log LLM call start
-  const systemLength = options.system.length;
-  const userLength =
-    typeof options.user === "string"
-      ? options.user.length
-      : JSON.stringify(options.user).length;
-  debug?.llmCallStart({
-    callId,
-    model: JSON.stringify(options.model),
-    schemaName: options.schemaName,
-    systemLength,
-    userLength,
-    artifactCount: Array.isArray(options.user) ? options.user.length : 0,
-  });
-  debug?.promptSystem({ callId, system: options.system });
-  debug?.promptUser({ callId, user: options.user });
-  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
-    const executor = options.execute ?? generateStructured;
-    const isFinalAttempt = attempt === maxAttempts;
-    const useStrictValidation = options.strict === true || isFinalAttempt;
-    debug?.validationStart({
-      callId,
-      attempt,
-      maxAttempts,
-      strict: useStrictValidation,
-    });
-    const startTime = Date.now();
-    const result = await executor({
-      model: options.model,
-      schema: options.schema,
-      schemaName: options.schemaName,
-      system: options.system,
-      user: options.user,
-      messages,
-      strict: options.strict,
-      telemetry,
-      parentSpan: retrySpan,
-    });
-    const durationMs = Date.now() - startTime;
-    usage = {
-      inputTokens: usage.inputTokens + result.usage.inputTokens,
-      outputTokens: usage.outputTokens + result.usage.outputTokens,
-      totalTokens: usage.totalTokens + result.usage.totalTokens,
-    };
-    debug?.rawResponse({ callId, response: result.data });
-    try {
-      if (useStrictValidation) {
-        const validated = validateOrThrow<T>(
-          ajv,
-          options.schema as never,
-          result.data,
-        );
-        debug?.validationSuccess({ callId, attempt });
-        debug?.llmCallComplete({
-          callId,
-          success: true,
-          inputTokens: usage.inputTokens,
-          outputTokens: usage.outputTokens,
-          totalTokens: usage.totalTokens,
-          durationMs,
-        });
-        // Record successful validation
-        if (retrySpan && telemetry) {
-          telemetry.recordEvent(retrySpan, {
-            type: "validation",
-            attempt,
-            maxAttempts,
-            schema: options.schema,
-            input: result.data,
-            success: true,
-            latencyMs: durationMs,
-          });
-          telemetry.endSpan(retrySpan, {
-            status: "ok",
-            output: validated,
-            latencyMs: durationMs,
-          });
-        }
-        return { data: validated, usage };
-      } else {
-        const validationResult = validateAllowingMissingRequired<T>(
-          ajv,
-          options.schema as never,
-          result.data,
-          isFinalAttempt,
-        );
-        if (validationResult.valid) {
-          debug?.validationSuccess({ callId, attempt });
-          debug?.llmCallComplete({
-            callId,
-            success: true,
-            inputTokens: usage.inputTokens,
-            outputTokens: usage.outputTokens,
-            totalTokens: usage.totalTokens,
-            durationMs,
-          });
-          // Record successful validation
-          if (retrySpan && telemetry) {
-            telemetry.recordEvent(retrySpan, {
-              type: "validation",
-              attempt,
-              maxAttempts,
-              schema: options.schema,
-              input: result.data,
-              success: true,
-              latencyMs: durationMs,
-            });
-            telemetry.endSpan(retrySpan, {
-              status: "ok",
-              output: validationResult.data,
-              latencyMs: durationMs,
-            });
-          }
-          return { data: validationResult.data, usage };
-        }
-        throw new SchemaValidationError(
-          "Schema validation failed",
-          validationResult.errors,
-        );
-      }
-    } catch (error) {
-      lastError = error as Error;
-      if (error instanceof SchemaValidationError) {
-        debug?.validationFailed({
-          callId,
-          attempt,
-          errors: error.errors,
-        });
-        // Record failed validation
-        if (retrySpan && telemetry) {
-          telemetry.recordEvent(retrySpan, {
-            type: "validation",
-            attempt,
-            maxAttempts,
-            schema: options.schema,
-            input: result.data,
-            success: false,
-            errors: error.errors,
-            latencyMs: durationMs,
-          });
-        }
-        // Emit retry event before attempting retry
-        const nextAttempt = attempt + 1;
-        if (nextAttempt <= maxAttempts) {
-          await options.events?.onRetry?.({
-            attempt: nextAttempt,
-            maxAttempts,
-            reason: "schema_validation_failed",
-          });
-          debug?.retry({
-            callId,
-            attempt: nextAttempt,
-            maxAttempts,
-            reason: "schema_validation_failed",
-          });
-        }
-        const errorPayload = JSON.stringify(error.errors, null, 2);
-        const errorMessage = `<validation-errors>\n${errorPayload}\n</validation-errors>`;
-        messages.push({ role: "user", content: errorMessage });
-        await options.events?.onMessage?.({
-          role: "user",
-          content: errorMessage,
-        });
-        continue;
-      }
-      debug?.llmCallComplete({
-        callId,
-        success: false,
-        inputTokens: usage.inputTokens,
-        outputTokens: usage.outputTokens,
-        totalTokens: usage.totalTokens,
-        durationMs,
-        error: (error as Error).message,
-      });
-      // Record error in telemetry
-      if (retrySpan && telemetry) {
-        telemetry.endSpan(retrySpan, {
-          status: "error",
-          error: error as Error,
-          latencyMs: durationMs,
-        });
-      }
-      break;
-    }
-  }
-  throw lastError ?? new Error("Unknown extraction error");
-};

package/src/llm/message.test.ts DELETED Viewed

@@ -1,42 +0,0 @@
-import { test, expect } from "bun:test";
-import type { Artifact } from "../types";
-import { buildUserContent } from "./message";
-const makeArtifact = (contents: Artifact["contents"]): Artifact => ({
-  id: "a1",
-  type: "text",
-  raw: async () => Buffer.from(""),
-  contents,
-});
-test("buildUserContent returns text when no images", () => {
-  const artifacts = [makeArtifact([{ text: "hello" }])];
-  const content = buildUserContent("prompt", artifacts);
-  expect(content).toBe("prompt");
-});
-test("buildUserContent appends images in order", () => {
-  const artifacts: Artifact[] = [
-    makeArtifact([
-      { media: [{ type: "image", base64: "base" }] },
-      { media: [{ type: "image", url: "https://example.com/img.png" }] },
-    ]),
-    {
-      id: "a2",
-      type: "image",
-      raw: async () => Buffer.from(""),
-      contents: [{ media: [{ type: "image", contents: Buffer.from([1]) }] }],
-    },
-  ];
-  const content = buildUserContent("prompt", artifacts);
-  expect(Array.isArray(content)).toBe(true);
-  if (Array.isArray(content)) {
-    expect(content[0]).toEqual({ type: "text", text: "prompt" });
-    expect(content[1]).toEqual({ type: "image", image: "base" });
-    expect(content[2]).toEqual({ type: "image", image: "https://example.com/img.png" });
-    expect(content[3]).toEqual({ type: "image", image: Buffer.from([1]) });
-  }
-});

package/src/llm/message.ts DELETED Viewed

@@ -1,47 +0,0 @@
-import type { Artifact } from "../types";
-export type ImagePart = {
-  type: "image";
-  image: string | Buffer;
-};
-export type TextPart = {
-  type: "text";
-  text: string;
-};
-export type UserContent = string | Array<TextPart | ImagePart>;
-const collectImages = (artifacts: Artifact[]): ImagePart[] => {
-  const parts: ImagePart[] = [];
-  for (const artifact of artifacts) {
-    for (const content of artifact.contents) {
-      if (!content.media?.length) {
-        continue;
-      }
-      for (const media of content.media) {
-        if (media.contents) {
-          parts.push({ type: "image", image: media.contents });
-        } else if (media.base64) {
-          parts.push({ type: "image", image: media.base64 });
-        } else if (media.url) {
-          parts.push({ type: "image", image: media.url });
-        }
-      }
-    }
-  }
-  return parts;
-};
-export const buildUserContent = (text: string, artifacts: Artifact[]): UserContent => {
-  const images = collectImages(artifacts);
-  if (images.length === 0) {
-    return text;
-  }
-  return [{ type: "text", text }, ...images];
-};

package/src/llm/models.test.ts DELETED Viewed

@@ -1,82 +0,0 @@
-import { test, expect } from "bun:test";
-import { __testing__ } from "./models";
-test("parseOpenAiModels returns model ids", () => {
-  const models = __testing__.parseOpenAiModels({
-    object: "list",
-    data: [{ id: "gpt-4o-mini" }, { id: "gpt-4o" }],
-  });
-  expect(models).toEqual(["gpt-4o-mini", "gpt-4o"]);
-});
-test("parseAnthropicModels returns model ids", () => {
-  const models = __testing__.parseAnthropicModels({
-    data: [{ id: "claude-3-5-sonnet-20241022" }],
-  });
-  expect(models).toEqual(["claude-3-5-sonnet-20241022"]);
-});
-test("parseGoogleModels strips models prefix", () => {
-  const models = __testing__.parseGoogleModels({
-    models: [{ name: "models/gemini-1.5-flash" }],
-  });
-  expect(models).toEqual(["gemini-1.5-flash"]);
-});
-test("parseOpenRouterModels returns model ids", () => {
-  const models = __testing__.parseOpenRouterModels({
-    data: [{ id: "openai/gpt-4o" }, { id: "anthropic/claude-3.5-sonnet" }],
-  });
-  expect(models).toEqual(["openai/gpt-4o", "anthropic/claude-3.5-sonnet"]);
-});
-test("parseOpenAiModels handles empty data", () => {
-  const models = __testing__.parseOpenAiModels({});
-  expect(models).toEqual([]);
-});
-test("parseOpenAiModels filters out undefined ids", () => {
-  const models = __testing__.parseOpenAiModels({
-    data: [{ id: "gpt-4" }, { notId: "bad" }],
-  });
-  expect(models).toEqual(["gpt-4"]);
-});
-test("parseGoogleModels handles empty models", () => {
-  const models = __testing__.parseGoogleModels({});
-  expect(models).toEqual([]);
-});
-test("pickCheapestModel prefers known cheap models", () => {
-  const models = ["gpt-4o", "gpt-4o-mini"];
-  expect(__testing__.pickCheapestModel("openai", models)).toBe("gpt-4o-mini");
-});
-test("pickCheapestModel returns first model if no preference matches", () => {
-  const models = ["unknown-model-1", "unknown-model-2"];
-  expect(__testing__.pickCheapestModel("openai", models)).toBe("unknown-model-1");
-});
-test("pickCheapestModel matches prefix for versioned models", () => {
-  const models = ["gpt-4o-mini-2024-07-18", "gpt-4o-2024-05-13"];
-  expect(__testing__.pickCheapestModel("openai", models)).toBe("gpt-4o-mini-2024-07-18");
-});
-test("pickCheapestModel handles anthropic preferences", () => {
-  const models = ["claude-3-opus", "claude-3-5-haiku-20241022"];
-  expect(__testing__.pickCheapestModel("anthropic", models)).toBe("claude-3-5-haiku-20241022");
-});
-test("pickCheapestModel handles google preferences", () => {
-  const models = ["gemini-1.5-pro", "gemini-2.0-flash"];
-  expect(__testing__.pickCheapestModel("google", models)).toBe("gemini-2.0-flash");
-});
-test("pickCheapestModel handles unknown provider", () => {
-  const models = ["model-a", "model-b"];
-  expect(__testing__.pickCheapestModel("unknown", models)).toBe("model-a");
-});