@struktur/sdk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/README.md +79 -0
  2. package/package.json +33 -0
  3. package/src/artifacts/AGENTS.md +16 -0
  4. package/src/artifacts/fileToArtifact.test.ts +37 -0
  5. package/src/artifacts/fileToArtifact.ts +44 -0
  6. package/src/artifacts/input.test.ts +243 -0
  7. package/src/artifacts/input.ts +360 -0
  8. package/src/artifacts/providers.test.ts +19 -0
  9. package/src/artifacts/providers.ts +7 -0
  10. package/src/artifacts/urlToArtifact.test.ts +23 -0
  11. package/src/artifacts/urlToArtifact.ts +19 -0
  12. package/src/auth/AGENTS.md +11 -0
  13. package/src/auth/config.test.ts +132 -0
  14. package/src/auth/config.ts +129 -0
  15. package/src/auth/tokens.test.ts +58 -0
  16. package/src/auth/tokens.ts +229 -0
  17. package/src/chunking/AGENTS.md +11 -0
  18. package/src/chunking/ArtifactBatcher.test.ts +22 -0
  19. package/src/chunking/ArtifactBatcher.ts +110 -0
  20. package/src/chunking/ArtifactSplitter.test.ts +38 -0
  21. package/src/chunking/ArtifactSplitter.ts +151 -0
  22. package/src/debug/AGENTS.md +79 -0
  23. package/src/debug/logger.test.ts +244 -0
  24. package/src/debug/logger.ts +211 -0
  25. package/src/extract.test.ts +22 -0
  26. package/src/extract.ts +114 -0
  27. package/src/fields.test.ts +663 -0
  28. package/src/fields.ts +239 -0
  29. package/src/index.test.ts +20 -0
  30. package/src/index.ts +93 -0
  31. package/src/llm/AGENTS.md +9 -0
  32. package/src/llm/LLMClient.test.ts +196 -0
  33. package/src/llm/LLMClient.ts +106 -0
  34. package/src/llm/RetryingRunner.test.ts +174 -0
  35. package/src/llm/RetryingRunner.ts +188 -0
  36. package/src/llm/message.test.ts +42 -0
  37. package/src/llm/message.ts +47 -0
  38. package/src/llm/models.test.ts +82 -0
  39. package/src/llm/models.ts +190 -0
  40. package/src/merge/AGENTS.md +6 -0
  41. package/src/merge/Deduplicator.test.ts +108 -0
  42. package/src/merge/Deduplicator.ts +45 -0
  43. package/src/merge/SmartDataMerger.test.ts +177 -0
  44. package/src/merge/SmartDataMerger.ts +56 -0
  45. package/src/parsers/AGENTS.md +58 -0
  46. package/src/parsers/collect.test.ts +56 -0
  47. package/src/parsers/collect.ts +31 -0
  48. package/src/parsers/index.ts +6 -0
  49. package/src/parsers/mime.test.ts +91 -0
  50. package/src/parsers/mime.ts +137 -0
  51. package/src/parsers/npm.ts +26 -0
  52. package/src/parsers/pdf.test.ts +394 -0
  53. package/src/parsers/pdf.ts +194 -0
  54. package/src/parsers/runner.test.ts +95 -0
  55. package/src/parsers/runner.ts +177 -0
  56. package/src/parsers/types.ts +29 -0
  57. package/src/prompts/AGENTS.md +8 -0
  58. package/src/prompts/DeduplicationPrompt.test.ts +41 -0
  59. package/src/prompts/DeduplicationPrompt.ts +37 -0
  60. package/src/prompts/ExtractorPrompt.test.ts +21 -0
  61. package/src/prompts/ExtractorPrompt.ts +72 -0
  62. package/src/prompts/ParallelMergerPrompt.test.ts +8 -0
  63. package/src/prompts/ParallelMergerPrompt.ts +37 -0
  64. package/src/prompts/SequentialExtractorPrompt.test.ts +24 -0
  65. package/src/prompts/SequentialExtractorPrompt.ts +82 -0
  66. package/src/prompts/formatArtifacts.test.ts +39 -0
  67. package/src/prompts/formatArtifacts.ts +46 -0
  68. package/src/strategies/AGENTS.md +6 -0
  69. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +53 -0
  70. package/src/strategies/DoublePassAutoMergeStrategy.ts +270 -0
  71. package/src/strategies/DoublePassStrategy.test.ts +48 -0
  72. package/src/strategies/DoublePassStrategy.ts +179 -0
  73. package/src/strategies/ParallelAutoMergeStrategy.test.ts +152 -0
  74. package/src/strategies/ParallelAutoMergeStrategy.ts +241 -0
  75. package/src/strategies/ParallelStrategy.test.ts +61 -0
  76. package/src/strategies/ParallelStrategy.ts +157 -0
  77. package/src/strategies/SequentialAutoMergeStrategy.test.ts +66 -0
  78. package/src/strategies/SequentialAutoMergeStrategy.ts +222 -0
  79. package/src/strategies/SequentialStrategy.test.ts +53 -0
  80. package/src/strategies/SequentialStrategy.ts +119 -0
  81. package/src/strategies/SimpleStrategy.test.ts +46 -0
  82. package/src/strategies/SimpleStrategy.ts +74 -0
  83. package/src/strategies/concurrency.test.ts +16 -0
  84. package/src/strategies/concurrency.ts +14 -0
  85. package/src/strategies/index.test.ts +20 -0
  86. package/src/strategies/index.ts +7 -0
  87. package/src/strategies/utils.test.ts +76 -0
  88. package/src/strategies/utils.ts +56 -0
  89. package/src/tokenization.test.ts +119 -0
  90. package/src/tokenization.ts +71 -0
  91. package/src/types.test.ts +25 -0
  92. package/src/types.ts +116 -0
  93. package/src/validation/AGENTS.md +6 -0
  94. package/src/validation/validator.test.ts +172 -0
  95. package/src/validation/validator.ts +82 -0
  96. package/tsconfig.json +22 -0
@@ -0,0 +1,190 @@
1
+ import type { ProviderModelsResult } from "../types";
2
+ import { resolveProviderEnvVar, resolveProviderToken } from "../auth/tokens";
3
+
4
+ const openAiModelsUrl = "https://api.openai.com/v1/models";
5
+ const anthropicModelsUrl = "https://api.anthropic.com/v1/models";
6
+ const googleModelsUrl = "https://generativelanguage.googleapis.com/v1beta/models";
7
+ const openRouterModelsUrl = "https://openrouter.ai/api/v1/models";
8
+
9
/**
 * Resolves the API token to use for a provider.
 *
 * A non-empty value found in the provider's environment variable (as named by
 * `resolveProviderEnvVar`) takes precedence; otherwise the lookup is delegated
 * to `resolveProviderToken`.
 *
 * @param provider Provider identifier, e.g. "openai".
 * @returns The environment value, or whatever `resolveProviderToken` resolves to.
 */
const getTokenForProvider = async (provider: string) => {
  const envVar = resolveProviderEnvVar(provider);
  const fromEnvironment = envVar ? process.env[envVar] : undefined;
  if (fromEnvironment) {
    return fromEnvironment;
  }
  return await resolveProviderToken(provider);
};
16
+
17
/**
 * Extracts model ids from a payload shaped like `{ data: [{ id }] }`.
 *
 * Entries whose `id` is not a string are skipped; a missing `data` member
 * (or a null/undefined payload) yields an empty list.
 */
const parseOpenAiModels = (json: unknown) => {
  const payload = json as { data?: Array<{ id?: string }> } | undefined;
  const entries = payload?.data ?? [];
  return entries.flatMap((entry) => (typeof entry.id === "string" ? [entry.id] : []));
};
21
+
22
/**
 * Collects the string `id` of every entry in a `{ data: [...] }` payload.
 *
 * Entries without a string `id` are ignored; an absent `data` member produces
 * an empty result.
 */
const parseAnthropicModels = (json: unknown) => {
  const listing = (json as { data?: Array<{ id?: string }> } | undefined)?.data ?? [];
  return listing.reduce<string[]>((ids, { id }) => {
    if (typeof id === "string") {
      ids.push(id);
    }
    return ids;
  }, []);
};
26
+
27
/**
 * Extracts model names from a payload shaped like `{ models: [{ name }] }`.
 *
 * Non-string names are skipped, and a leading `models/` prefix is removed from
 * each remaining name.
 */
const parseGoogleModels = (json: unknown) => {
  const payload = json as { models?: Array<{ name?: string }> } | undefined;
  const entries = payload?.models ?? [];
  return entries.flatMap((entry) => {
    const rawName = entry.name;
    return typeof rawName === "string" ? [rawName.replace(/^models\//, "")] : [];
  });
};
34
+
35
/**
 * Reads model ids out of a `{ data: [{ id }] }` payload.
 *
 * Only entries carrying a string `id` contribute to the result; a payload
 * without `data` yields an empty list.
 */
const parseOpenRouterModels = (json: unknown) => {
  const catalogue = (json as { data?: Array<{ id?: string }> } | undefined)?.data ?? [];
  const withStringId = catalogue.filter(
    (entry): entry is { id: string } => typeof entry.id === "string",
  );
  return withStringId.map((entry) => entry.id);
};
39
+
40
/**
 * Performs a GET against a model-listing endpoint and returns the decoded JSON body.
 *
 * @param url Endpoint to request.
 * @param headers Optional request headers (authentication, API version, ...).
 * @returns The parsed JSON body, untyped.
 * @throws Error when the response status is not OK. The message is the response
 *   body, or `HTTP <status> <statusText>` when the body is blank, so callers
 *   never end up surfacing an empty error message.
 */
const fetchModelsJson = async (
  url: string,
  headers?: Record<string, string>,
): Promise<unknown> => {
  const response = await fetch(url, headers ? { headers } : undefined);
  if (!response.ok) {
    const body = await response.text();
    const fallback = `HTTP ${response.status} ${response.statusText}`.trim();
    throw new Error(body.trim() ? body : fallback);
  }
  return (await response.json()) as unknown;
};

/**
 * Lists the model ids a provider exposes.
 *
 * For "openai", "anthropic", "google" and "openrouter" the provider's models
 * endpoint is queried and the response is parsed by the matching
 * `parse*Models` helper. For "opencode" a hard-coded list is returned and no
 * request is made.
 *
 * @param provider Provider identifier.
 * @param token API token sent with the request (unused for "opencode").
 * @returns The model ids reported for the provider.
 * @throws Error when the provider is unsupported or the endpoint responds with
 *   a non-OK status.
 */
const requestModels = async (provider: string, token: string): Promise<string[]> => {
  switch (provider) {
    case "openai":
      return parseOpenAiModels(
        await fetchModelsJson(openAiModelsUrl, { Authorization: `Bearer ${token}` }),
      );

    case "anthropic":
      return parseAnthropicModels(
        await fetchModelsJson(anthropicModelsUrl, {
          "x-api-key": token,
          "anthropic-version": "2023-06-01",
        }),
      );

    case "google":
      // The key travels as a query parameter, so it must be URL-encoded.
      return parseGoogleModels(
        await fetchModelsJson(`${googleModelsUrl}?key=${encodeURIComponent(token)}`),
      );

    case "openrouter":
      return parseOpenRouterModels(
        await fetchModelsJson(openRouterModelsUrl, { Authorization: `Bearer ${token}` }),
      );

    case "opencode":
      // OpenCode doesn't have a public models endpoint, return known models
      return [
        "gpt-5.2",
        "gpt-5.2-codex",
        "gpt-5.1",
        "gpt-5.1-codex",
        "gpt-5.1-codex-max",
        "gpt-5.1-codex-mini",
        "gpt-5",
        "gpt-5-codex",
        "gpt-5-nano",
        "claude-opus-4-6",
        "claude-opus-4-5",
        "claude-opus-4-1",
        "claude-sonnet-4-6",
        "claude-sonnet-4-5",
        "claude-sonnet-4",
        "claude-haiku-4-5",
        "claude-haiku-3.5",
        "gemini-3.1-pro",
        "gemini-3-pro",
        "gemini-3-flash",
        "minimax-m2.5",
        "minimax-m2.5-free",
        "minimax-m2.1",
        "glm-5",
        "glm-5-free",
        "glm-4.7",
        "glm-4.6",
        "kimi-k2.5",
        "kimi-k2.5-free",
        "kimi-k2-thinking",
        "kimi-k2",
        "qwen3-coder",
        "big-pickle",
      ];

    default:
      throw new Error(`Unsupported provider: ${provider}`);
  }
};
127
+
128
/**
 * Per-provider model ids to try, in order, when picking a cheap default model.
 *
 * `pickCheapestModel` walks each list front to back and returns the first
 * available model that equals an entry or starts with `<entry>-`. Entries must
 * therefore correspond to ids the provider can actually report; for "opencode"
 * that means the hard-coded list returned by `requestModels`.
 */
const cheapestModelPreferences: Record<string, string[]> = {
  openai: ["gpt-4.1-nano", "gpt-4.1-mini", "gpt-4o-mini", "gpt-4o"],
  anthropic: ["claude-3-5-haiku", "claude-3-haiku"],
  google: ["gemini-1.5-flash-8b", "gemini-1.5-flash", "gemini-2.0-flash", "gemini-1.5-pro"],
  // "kimi-k2.5-free" (not "kimi-k2-free") is the id present in the opencode model list.
  opencode: ["gpt-5-nano", "claude-haiku-3.5", "gemini-3-flash", "kimi-k2.5-free", "glm-5-free", "minimax-m2.5-free"],
  openrouter: ["openai/gpt-4o-mini", "anthropic/claude-3.5-haiku", "google/gemini-flash-1.5"],
};
135
+
136
/**
 * Tells whether a model id satisfies a preference entry: either the two are
 * identical, or the model id begins with the preference followed by a hyphen
 * (e.g. a suffixed variant of the preferred id).
 */
const matchesPreference = (model: string, preference: string) => {
  if (model === preference) {
    return true;
  }
  return model.startsWith(preference + "-");
};
139
+
140
/**
 * Lists the models available for one provider without ever throwing for
 * request failures.
 *
 * @param provider Provider identifier.
 * @returns `{ ok: true, models }` on success; `{ ok: false, error }` when no
 *   token could be resolved ("No token available") or when `requestModels`
 *   throws (the error's message, or its string form for non-Error values).
 */
export const listProviderModels = async (provider: string): Promise<ProviderModelsResult> => {
  const token = await getTokenForProvider(provider);

  if (token) {
    try {
      return { provider, ok: true, models: await requestModels(provider, token) };
    } catch (error) {
      return {
        provider,
        ok: false,
        error: error instanceof Error ? error.message : String(error),
      };
    }
  }

  return { provider, ok: false, error: "No token available" };
};
154
+
155
/**
 * Runs `listProviderModels` for every given provider concurrently.
 *
 * @param providers Provider identifiers to query.
 * @returns One result per provider, in the same order as `providers`.
 */
export const listAllProviderModels = async (providers: string[]) => {
  const pending = providers.map((name) => listProviderModels(name));
  return await Promise.all(pending);
};
159
+
160
/**
 * Chooses a model id from `models` using the provider's preference list.
 *
 * Preferences are tried in order; for each one the first model that matches
 * (see `matchesPreference`) is taken. When the provider has no preference list
 * or nothing matches, the first entry of `models` is returned, which is
 * `undefined` for an empty list.
 *
 * @param provider Provider identifier used to look up the preference list.
 * @param models Model ids available for that provider.
 */
export const pickCheapestModel = (provider: string, models: string[]) => {
  const preferred = (cheapestModelPreferences[provider] ?? [])
    .map((preference) => models.find((model) => matchesPreference(model, preference)))
    .find((candidate) => Boolean(candidate));

  return preferred ?? models[0];
};
170
+
171
/**
 * Lists a provider's models and returns the one `pickCheapestModel` selects.
 *
 * @param provider Provider identifier.
 * @returns The selected model id.
 * @throws Error with the listing error (or a generic "Unable to list models"
 *   message) when listing fails, and "No models available" when the listing
 *   yields no usable model.
 */
export const resolveCheapestModel = async (provider: string) => {
  const listing = await listProviderModels(provider);

  if (listing.ok) {
    const cheapest = pickCheapestModel(provider, listing.models ?? []);
    if (cheapest) {
      return cheapest;
    }
    throw new Error(`No models available for provider: ${provider}`);
  }

  throw new Error(listing.error ?? `Unable to list models for provider: ${provider}`);
};
183
+
184
/** Internal helpers re-exported so unit tests can exercise them directly. */
export const __testing__ = {
  parseOpenAiModels: parseOpenAiModels,
  parseAnthropicModels: parseAnthropicModels,
  parseGoogleModels: parseGoogleModels,
  parseOpenRouterModels: parseOpenRouterModels,
  pickCheapestModel: pickCheapestModel,
};
@@ -0,0 +1,6 @@
1
+ Merge module
2
+
3
+ - Purpose: schema-aware merging and deduplication of extracted data.
4
+ - Key files: `SmartDataMerger.ts`, `Deduplicator.ts`.
5
+ - Design: arrays concatenate, objects shallow-merge, scalars prefer non-empty new values; dedupe uses FNV-1a (32-bit) hashing of a key-order-independent serialization.
6
+ - Tests: `SmartDataMerger.test.ts`, `Deduplicator.test.ts`.
@@ -0,0 +1,108 @@
1
+ import { test, expect } from "bun:test";
2
+ import { findExactDuplicatesWithHashing, deduplicateByIndices, fnv1a32 } from "./Deduplicator";
3
+
4
+ test("fnv1a32: official test vectors from lcn2/fnv", () => {
5
+ expect(fnv1a32("")).toBe(0x811c9dc5);
6
+ expect(fnv1a32("a")).toBe(0xe40c292c);
7
+ expect(fnv1a32("b")).toBe(0xe70c2de5);
8
+ expect(fnv1a32("c")).toBe(0xe60c2c52);
9
+ expect(fnv1a32("d")).toBe(0xe10c2473);
10
+ expect(fnv1a32("e")).toBe(0xe00c22e0);
11
+ expect(fnv1a32("f")).toBe(0xe30c2799);
12
+ expect(fnv1a32("fo")).toBe(0x6222e842);
13
+ expect(fnv1a32("foo")).toBe(0xa9f37ed7);
14
+ expect(fnv1a32("foob")).toBe(0x3f5076ef);
15
+ expect(fnv1a32("fooba")).toBe(0x39aaa18a);
16
+ expect(fnv1a32("foobar")).toBe(0xbf9cf968);
17
+ expect(fnv1a32("chongo was here!\n")).toBe(0xd49930d5);
18
+ });
19
+
20
+ test("fnv1a32: consistent results", () => {
21
+ const str = "test string for consistency";
22
+ const hash1 = fnv1a32(str);
23
+ const hash2 = fnv1a32(str);
24
+ const hash3 = fnv1a32(str);
25
+ expect(hash1).toBe(hash2);
26
+ expect(hash2).toBe(hash3);
27
+ });
28
+
29
+ test("fnv1a32: different strings produce different hashes", () => {
30
+ const strings = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"];
31
+ const hashes = strings.map(fnv1a32);
32
+ const uniqueHashes = new Set(hashes);
33
+ expect(uniqueHashes.size).toBe(strings.length);
34
+ });
35
+
36
+ test("fnv1a32: handles unicode", () => {
37
+ const hash1 = fnv1a32("hello");
38
+ const hash2 = fnv1a32("héllo");
39
+ const hash3 = fnv1a32("你好");
40
+ expect(hash1).not.toBe(hash2);
41
+ expect(typeof hash3).toBe("number");
42
+ expect(hash3).toBeGreaterThan(0);
43
+ });
44
+
45
+ test("fnv1a32: handles special characters", () => {
46
+ const hash1 = fnv1a32('{"key":"value"}');
47
+ const hash2 = fnv1a32('{"key":"value2"}');
48
+ expect(hash1).not.toBe(hash2);
49
+ });
50
+
51
+ test("fnv1a32: returns unsigned 32-bit integer", () => {
52
+ const hash = fnv1a32("some test string");
53
+ expect(hash).toBeGreaterThanOrEqual(0);
54
+ expect(hash).toBeLessThan(4294967296);
55
+ expect(Number.isInteger(hash)).toBe(true);
56
+ });
57
+
58
+ test("fnv1a32: collision resistance for similar strings", () => {
59
+ const strings = [
60
+ "item1", "item2", "item3", "item4", "item5",
61
+ "item6", "item7", "item8", "item9", "item10",
62
+ "Item1", "ITEM1", "itemA", "itemB", "itemC",
63
+ ];
64
+ const hashes = strings.map(fnv1a32);
65
+ const uniqueHashes = new Set(hashes);
66
+ expect(uniqueHashes.size).toBe(strings.length);
67
+ });
68
+
69
+ test("findExactDuplicatesWithHashing finds duplicates", () => {
70
+ const duplicates = findExactDuplicatesWithHashing([
71
+ { id: 1, name: "A" },
72
+ { id: 1, name: "A" },
73
+ { id: 2, name: "B" },
74
+ ]);
75
+
76
+ expect(duplicates).toEqual([1]);
77
+ });
78
+
79
+ test("deduplicateByIndices removes by index", () => {
80
+ const items = ["a", "b", "c"];
81
+ const result = deduplicateByIndices(items, [1]);
82
+ expect(result).toEqual(["a", "c"]);
83
+ });
84
+
85
+ test("findExactDuplicatesWithHashing handles empty array", () => {
86
+ expect(findExactDuplicatesWithHashing([])).toEqual([]);
87
+ });
88
+
89
+ test("findExactDuplicatesWithHashing handles no duplicates", () => {
90
+ expect(findExactDuplicatesWithHashing([1, 2, 3])).toEqual([]);
91
+ });
92
+
93
+ test("findExactDuplicatesWithHashing handles all duplicates", () => {
94
+ expect(findExactDuplicatesWithHashing([1, 1, 1])).toEqual([1, 2]);
95
+ });
96
+
97
+ test("findExactDuplicatesWithHashing handles complex objects", () => {
98
+ const obj = { a: 1, b: { c: 2 } };
99
+ const duplicates = findExactDuplicatesWithHashing([obj, obj, { a: 1, b: { c: 3 } }]);
100
+ expect(duplicates).toEqual([1]);
101
+ });
102
+
103
+ test("findExactDuplicatesWithHashing: key order doesn't matter", () => {
104
+ const obj1 = { a: 1, b: 2 };
105
+ const obj2 = { b: 2, a: 1 };
106
+ const duplicates = findExactDuplicatesWithHashing([obj1, obj2]);
107
+ expect(duplicates).toEqual([1]);
108
+ });
@@ -0,0 +1,45 @@
1
/**
 * Computes a 32-bit FNV-1a style hash of a string.
 *
 * Each UTF-16 code unit (as returned by `charCodeAt`) is XOR-ed into the
 * accumulator, which is then multiplied by the FNV prime using 32-bit integer
 * multiplication.
 *
 * @param str Text to hash.
 * @returns The hash as an unsigned 32-bit integer.
 */
export const fnv1a32 = (str: string): number => {
  const prime = 16777619;
  let accumulator = 2166136261;
  let position = 0;
  while (position < str.length) {
    accumulator = Math.imul(accumulator ^ str.charCodeAt(position), prime);
    position += 1;
  }
  // Reinterpret the signed 32-bit result as unsigned.
  return accumulator >>> 0;
};
9
+
10
/**
 * Serializes a value to a string that does not depend on object key order,
 * so structurally equal values always produce the same text.
 *
 * - Primitives and `null` use `JSON.stringify`; values it cannot represent
 *   (`undefined`, functions, symbols) become the literal text `undefined`
 *   rather than a non-string result.
 * - Arrays keep their element order.
 * - Object entries are sorted by key using code-unit comparison, and keys are
 *   JSON-escaped so a key containing quotes cannot imitate another structure.
 *
 * @param value Value to serialize.
 * @returns A deterministic string form of `value`.
 */
const stableStringify = (value: unknown): string => {
  if (value === null || typeof value !== "object") {
    // JSON.stringify returns undefined (not a string) for undefined/functions/symbols.
    return JSON.stringify(value) ?? "undefined";
  }

  if (Array.isArray(value)) {
    return `[${value.map((item) => stableStringify(item)).join(",")}]`;
  }

  const entries = Object.entries(value as Record<string, unknown>)
    // Plain comparison is locale-independent and never reports distinct keys as equal.
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
    .map(([key, val]) => `${JSON.stringify(key)}:${stableStringify(val)}`);

  return `{${entries.join(",")}}`;
};
25
+
26
/**
 * Finds items that are exact structural duplicates of an earlier item.
 *
 * Each item is serialized with `stableStringify` and bucketed by its
 * `fnv1a32` hash. An item counts as a duplicate only when an earlier item in
 * the same bucket has the identical serialized form, so two different items
 * that merely share a 32-bit hash are both kept.
 *
 * @param items Values to inspect.
 * @returns Indices (ascending) of every item that repeats an earlier one; the
 *   first occurrence is never included.
 */
export const findExactDuplicatesWithHashing = (items: unknown[]) => {
  const buckets = new Map<number, string[]>();
  const duplicates: number[] = [];

  items.forEach((item, index) => {
    const serialized = stableStringify(item);
    const hash = fnv1a32(serialized);
    const bucket = buckets.get(hash);

    if (bucket === undefined) {
      buckets.set(hash, [serialized]);
      return;
    }

    if (bucket.includes(serialized)) {
      duplicates.push(index);
      return;
    }

    // Same hash, different content: a collision, not a duplicate.
    bucket.push(serialized);
  });

  return duplicates;
};
41
+
42
/**
 * Returns a copy of `items` without the elements at the given positions.
 *
 * @param items Source list; it is not modified.
 * @param indices Positions to drop. Repeated or out-of-range positions are harmless.
 * @returns The remaining elements in their original order.
 */
export const deduplicateByIndices = <T>(items: T[], indices: number[]) => {
  const dropped = new Set<number>(indices);
  const kept: T[] = [];
  items.forEach((item, position) => {
    if (!dropped.has(position)) {
      kept.push(item);
    }
  });
  return kept;
};
@@ -0,0 +1,177 @@
1
+ import { test, expect } from "bun:test";
2
+ import { SmartDataMerger } from "./SmartDataMerger";
3
+
4
+ test("SmartDataMerger concatenates arrays and preserves scalars", () => {
5
+ const schema = {
6
+ type: "object",
7
+ properties: {
8
+ items: { type: "array" },
9
+ title: { type: "string" },
10
+ },
11
+ };
12
+
13
+ const merger = new SmartDataMerger(schema);
14
+ const result = merger.merge(
15
+ { items: [1], title: "A" },
16
+ { items: [2], title: "" }
17
+ );
18
+
19
+ expect(result.items).toEqual([1, 2]);
20
+ expect(result.title).toBe("A");
21
+ });
22
+
23
+ test("SmartDataMerger merges nested objects", () => {
24
+ const schema = {
25
+ type: "object",
26
+ properties: {
27
+ user: {
28
+ type: "object",
29
+ properties: {
30
+ name: { type: "string" },
31
+ email: { type: "string" },
32
+ },
33
+ },
34
+ },
35
+ };
36
+
37
+ const merger = new SmartDataMerger(schema);
38
+ const result = merger.merge(
39
+ { user: { name: "Alice" } },
40
+ { user: { email: "alice@example.com" } }
41
+ );
42
+
43
+ expect(result.user).toEqual({ name: "Alice", email: "alice@example.com" });
44
+ });
45
+
46
+ test("SmartDataMerger prefers new scalar values when not empty", () => {
47
+ const schema = {
48
+ type: "object",
49
+ properties: {
50
+ title: { type: "string" },
51
+ count: { type: "number" },
52
+ },
53
+ };
54
+
55
+ const merger = new SmartDataMerger(schema);
56
+ const result = merger.merge(
57
+ { title: "Old", count: 1 },
58
+ { title: "New", count: 2 }
59
+ );
60
+
61
+ expect(result.title).toBe("New");
62
+ expect(result.count).toBe(2);
63
+ });
64
+
65
+ test("SmartDataMerger preserves old value when new is null", () => {
66
+ const schema = {
67
+ type: "object",
68
+ properties: {
69
+ title: { type: "string" },
70
+ },
71
+ };
72
+
73
+ const merger = new SmartDataMerger(schema);
74
+ const result = merger.merge({ title: "Old" }, { title: null });
75
+
76
+ expect(result.title).toBe("Old");
77
+ });
78
+
79
+ test("SmartDataMerger preserves old value when new is undefined", () => {
80
+ const schema = {
81
+ type: "object",
82
+ properties: {
83
+ title: { type: "string" },
84
+ },
85
+ };
86
+
87
+ const merger = new SmartDataMerger(schema);
88
+ const result = merger.merge({ title: "Old" }, {});
89
+
90
+ expect(result.title).toBe("Old");
91
+ });
92
+
93
+ test("SmartDataMerger handles missing current value for arrays", () => {
94
+ const schema = {
95
+ type: "object",
96
+ properties: {
97
+ items: { type: "array" },
98
+ },
99
+ };
100
+
101
+ const merger = new SmartDataMerger(schema);
102
+ const result = merger.merge({}, { items: [1, 2] });
103
+
104
+ expect(result.items).toEqual([1, 2]);
105
+ });
106
+
107
+ test("SmartDataMerger handles missing new value for arrays", () => {
108
+ const schema = {
109
+ type: "object",
110
+ properties: {
111
+ items: { type: "array" },
112
+ },
113
+ };
114
+
115
+ const merger = new SmartDataMerger(schema);
116
+ const result = merger.merge({ items: [1, 2] }, {});
117
+
118
+ expect(result.items).toEqual([1, 2]);
119
+ });
120
+
121
+ test("SmartDataMerger handles non-array values for array schema", () => {
122
+ const schema = {
123
+ type: "object",
124
+ properties: {
125
+ items: { type: "array" },
126
+ },
127
+ };
128
+
129
+ const merger = new SmartDataMerger(schema);
130
+ const result = merger.merge({ items: "not-an-array" }, { items: [1] });
131
+
132
+ expect(result.items).toEqual([1]);
133
+ });
134
+
135
+ test("SmartDataMerger handles non-object values for object schema", () => {
136
+ const schema = {
137
+ type: "object",
138
+ properties: {
139
+ user: { type: "object", properties: {} },
140
+ },
141
+ };
142
+
143
+ const merger = new SmartDataMerger(schema);
144
+ const result = merger.merge({ user: "not-an-object" }, { user: { name: "Alice" } });
145
+
146
+ expect(result.user).toEqual({ name: "Alice" });
147
+ });
148
+
149
+ test("SmartDataMerger preserves properties not in schema", () => {
150
+ const schema = {
151
+ type: "object",
152
+ properties: {
153
+ title: { type: "string" },
154
+ },
155
+ };
156
+
157
+ const merger = new SmartDataMerger(schema);
158
+ const result = merger.merge(
159
+ { title: "A", extra: "preserved" },
160
+ { title: "B" }
161
+ );
162
+
163
+ expect(result.title).toBe("B");
164
+ expect(result.extra).toBe("preserved");
165
+ });
166
+
167
+ test("SmartDataMerger handles empty schema properties", () => {
168
+ const schema = {
169
+ type: "object",
170
+ properties: {},
171
+ };
172
+
173
+ const merger = new SmartDataMerger(schema);
174
+ const result = merger.merge({ title: "A" }, { title: "B" });
175
+
176
+ expect(result.title).toBe("A");
177
+ });
@@ -0,0 +1,56 @@
1
+ import type { AnyJSONSchema } from "../types";
2
+
3
/** True when the property schema declares `type: "array"`. */
const isArraySchema = (schema: Record<string, unknown>) => schema.type === "array";

/** True when the property schema declares `type: "object"` and has a `properties` member of type object. */
const isObjectSchema = (schema: Record<string, unknown>) => {
  return schema.type === "object" && typeof schema.properties === "object";
};

/** Narrows to a non-null, non-array object, i.e. something that is safe to spread as a record. */
const isPlainRecord = (value: unknown): value is Record<string, unknown> => {
  return typeof value === "object" && value !== null && !Array.isArray(value);
};

/**
 * Merges two extraction results according to the top-level `properties` of a
 * JSON schema.
 *
 * For every property listed in the schema:
 * - array properties are concatenated (current first, then new); values that
 *   are not arrays are treated as empty;
 * - object properties (those with `type: "object"` and `properties`) are
 *   shallow-merged, new keys overriding current ones; values that are not
 *   plain objects, including arrays, are treated as empty;
 * - any other property takes the new value unless it is `undefined`, `null`
 *   or the empty string, in which case the current value is kept.
 *
 * Keys of `currentData` that the schema does not list are carried over
 * unchanged; keys of `newData` that the schema does not list are ignored.
 */
export class SmartDataMerger {
  private schema: AnyJSONSchema;

  constructor(schema: AnyJSONSchema) {
    this.schema = schema;
  }

  /**
   * @param currentData Data accumulated so far; it is not modified.
   * @param newData Freshly extracted data to fold in.
   * @returns A new object holding the merged result.
   */
  merge(currentData: Record<string, unknown>, newData: Record<string, unknown>) {
    const merged: Record<string, unknown> = { ...currentData };
    const properties =
      (this.schema as { properties?: Record<string, Record<string, unknown>> })
        .properties ?? {};

    for (const [key, propSchema] of Object.entries(properties)) {
      const currentValue = currentData[key];
      const newValue = newData[key];

      if (isArraySchema(propSchema)) {
        merged[key] = [
          ...(Array.isArray(currentValue) ? currentValue : []),
          ...(Array.isArray(newValue) ? newValue : []),
        ];
        continue;
      }

      if (isObjectSchema(propSchema)) {
        // Arrays pass a bare `typeof === "object"` check and would be spread
        // into index keys ("0", "1", ...), so they are excluded explicitly.
        merged[key] = {
          ...(isPlainRecord(currentValue) ? currentValue : {}),
          ...(isPlainRecord(newValue) ? newValue : {}),
        };
        continue;
      }

      // Scalars: `merged` already holds the current value from the initial
      // spread, so only a usable new value needs to be written.
      if (newValue !== undefined && newValue !== null && newValue !== "") {
        merged[key] = newValue;
      }
    }

    return merged;
  }
}
@@ -0,0 +1,58 @@
1
+ # Parsers module
2
+
3
+ - Purpose: detect MIME types, run external/npm/command parsers, and provide built-in PDF support.
4
+ - Key files: `types.ts`, `collect.ts`, `mime.ts`, `npm.ts`, `runner.ts`, `pdf.ts`, `index.ts`.
5
+
6
+ ## Types (`types.ts`)
7
+
8
+ - `NpmParserDef` — npm package parser definition (`type: "npm"`, `package: string`)
9
+ - `CommandFileDef` — command with `FILE_PATH` placeholder (`type: "command-file"`, `command: string`)
10
+ - `CommandStdinDef` — command that reads from stdin (`type: "command-stdin"`, `command: string`)
11
+ - `InlineParserDef` — inline handler function (`type: "inline"`, `handler: (buffer: Buffer) => Promise<Artifact>`)
12
+ - `ParserDef` — union of the four variants
13
+ - `ParsersConfig` — `Record<string, ParserDef>` keyed by MIME type
14
+ - `ParserInput` — `{ kind: "file"; path: string } | { kind: "buffer"; buffer: Buffer }`
15
+
16
+ ## npm Contract (`npm.ts`)
17
+
18
+ - `ParseStreamFn`, `ParseFileFn`, `DetectFileTypeFn`, `NpmParserModule` — interfaces that npm parser packages must implement.
19
+ - At least one of `parseStream` or `parseFile` must be exported.
20
+
21
+ ## collectStream (`collect.ts`)
22
+
23
+ - `collectStream(stream: ReadableStream<Uint8Array>): Promise<Buffer>` — public utility for npm parser authors to collect a stream into a Buffer.
24
+
25
+ ## MIME Detection (`mime.ts`)
26
+
27
+ Detection runs in three ordered steps (two built-in layers plus npm `detectFileType` callbacks):
28
+
29
+ 1. **Magic bytes** (authoritative): PDF, PNG, JPEG, GIF, WebP, ZIP/Office
30
+ 2. **npm `detectFileType`**: called after magic bytes with first 512 bytes
31
+ 3. **Extension database**: fallback when no magic bytes match (file inputs only)
32
+
33
+ `detectMimeType({ buffer?, filePath?, mimeOverride?, npmParsers? }): Promise<string | null>`
34
+
35
+ ## Runner (`runner.ts`)
36
+
37
+ `runParser(def: ParserDef, input: ParserInput, mimeType: string): Promise<Artifact[]>`
38
+
39
+ - **npm**: Dynamic import, prefer `parseFile` for file inputs (zero-copy), prefer `parseStream` for buffer inputs. Falls back via temp-file if needed.
40
+ - **command-file**: Interpolates `FILE_PATH` in command, writes temp file for buffer inputs.
41
+ - **command-stdin**: Pipes input buffer to subprocess stdin; captures stdout as `SerializedArtifact[]` JSON.
42
+ - **inline**: Calls the handler function directly with the buffer (reads file into buffer if needed).
43
+
44
+ ## Built-in PDF Parser (`pdf.ts`)
45
+
46
+ `parsePdf(input: Buffer | ReadableStream<Uint8Array>, options?: ParsePdfOptions): Promise<Artifact>`
47
+
48
+ Uses `pdf-parse` (npm package). Extracts per-page text **and** embedded images into `ArtifactContent[]`
49
+ with `page` numbers set. Returns an `Artifact` with `type: "pdf"`.
50
+
51
+ - Text extraction: per-page via `parser.getText()`; falls back to full document text when no per-page info is available.
52
+ - Image extraction: per-page via `parser.getImage({ imageBuffer: false, imageDataUrl: true })`. Each embedded image is mapped to an `ArtifactImage` with `base64` (raw base64 string, data-URL prefix stripped), `width`, `height`, and `imageType: "embedded"`. Images are merged into the `media` array of the matching `ArtifactContent` entry. Pages that contain images but no text produce their own content entry. Image extraction failure is non-fatal — the parser continues and returns text-only content.
53
+ - Screenshot rendering: per-page via `parser.getScreenshot()`. Each page is rendered to a PNG image and added to the `media` array with `imageType: "screenshot"`. Screenshots are appended to any embedded images for the same page. Screenshot rendering failure is non-fatal — the parser continues without screenshots.
54
+ - `imageThreshold` defaults to 80 px (from pdf-parse), filtering out tiny decorative images.
55
+ - `ParsePdfOptions.includeImages` (default `true`): set to `false` to skip `getImage()` entirely and return text-only content. This is used by the `--no-images` CLI flag.
56
+ - `ParsePdfOptions.screenshots` (default `false`): set to `true` to render page screenshots and include them as images. This is used by the `--screenshots` CLI flag.
57
+ - `ParsePdfOptions.screenshotScale` (default `1.5`): scale factor for screenshot rendering. Higher values produce larger, higher-quality images.
58
+ - `ParsePdfOptions.screenshotWidth`: target width in pixels for screenshots. If specified, takes precedence over `screenshotScale` and height is calculated to maintain aspect ratio.