@struktur/sdk 2.1.2 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200):
  1. package/dist/artifacts/fileToArtifact.d.ts +8 -0
  2. package/dist/artifacts/fileToArtifact.d.ts.map +1 -0
  3. package/dist/artifacts/input.d.ts +60 -0
  4. package/dist/artifacts/input.d.ts.map +1 -0
  5. package/{src/artifacts/providers.ts → dist/artifacts/providers.d.ts} +2 -4
  6. package/dist/artifacts/providers.d.ts.map +1 -0
  7. package/dist/artifacts/urlToArtifact.d.ts +3 -0
  8. package/dist/artifacts/urlToArtifact.d.ts.map +1 -0
  9. package/dist/auth/config.d.ts +34 -0
  10. package/dist/auth/config.d.ts.map +1 -0
  11. package/dist/auth/tokens.d.ts +18 -0
  12. package/dist/auth/tokens.d.ts.map +1 -0
  13. package/dist/chunking/ArtifactBatcher.d.ts +11 -0
  14. package/dist/chunking/ArtifactBatcher.d.ts.map +1 -0
  15. package/dist/chunking/ArtifactSplitter.d.ts +10 -0
  16. package/dist/chunking/ArtifactSplitter.d.ts.map +1 -0
  17. package/dist/debug/logger.d.ts +169 -0
  18. package/dist/debug/logger.d.ts.map +1 -0
  19. package/dist/extract.d.ts +3 -0
  20. package/dist/extract.d.ts.map +1 -0
  21. package/dist/fields.d.ts +75 -0
  22. package/dist/fields.d.ts.map +1 -0
  23. package/dist/index.d.ts +24 -0
  24. package/dist/index.d.ts.map +1 -0
  25. package/dist/index.js +5603 -0
  26. package/dist/index.js.map +1 -0
  27. package/dist/llm/LLMClient.d.ts +40 -0
  28. package/dist/llm/LLMClient.d.ts.map +1 -0
  29. package/dist/llm/RetryingRunner.d.ts +37 -0
  30. package/dist/llm/RetryingRunner.d.ts.map +1 -0
  31. package/dist/llm/message.d.ts +12 -0
  32. package/dist/llm/message.d.ts.map +1 -0
  33. package/dist/llm/models.d.ts +13 -0
  34. package/dist/llm/models.d.ts.map +1 -0
  35. package/dist/llm/resolveModel.d.ts +3 -0
  36. package/dist/llm/resolveModel.d.ts.map +1 -0
  37. package/dist/merge/Deduplicator.d.ts +4 -0
  38. package/dist/merge/Deduplicator.d.ts.map +1 -0
  39. package/dist/merge/SmartDataMerger.d.ts +7 -0
  40. package/dist/merge/SmartDataMerger.d.ts.map +1 -0
  41. package/dist/parsers/collect.d.ts +7 -0
  42. package/dist/parsers/collect.d.ts.map +1 -0
  43. package/{src/parsers/index.ts → dist/parsers/index.d.ts} +1 -0
  44. package/dist/parsers/index.d.ts.map +1 -0
  45. package/dist/parsers/mime.d.ts +12 -0
  46. package/dist/parsers/mime.d.ts.map +1 -0
  47. package/dist/parsers/npm.d.ts +16 -0
  48. package/dist/parsers/npm.d.ts.map +1 -0
  49. package/dist/parsers/pdf.d.ts +36 -0
  50. package/dist/parsers/pdf.d.ts.map +1 -0
  51. package/dist/parsers/runner.d.ts +4 -0
  52. package/dist/parsers/runner.d.ts.map +1 -0
  53. package/dist/parsers/types.d.ts +27 -0
  54. package/dist/parsers/types.d.ts.map +1 -0
  55. package/dist/parsers.d.ts +1 -0
  56. package/dist/parsers.js +492 -0
  57. package/dist/parsers.js.map +1 -0
  58. package/dist/prompts/DeduplicationPrompt.d.ts +5 -0
  59. package/dist/prompts/DeduplicationPrompt.d.ts.map +1 -0
  60. package/dist/prompts/ExtractorPrompt.d.ts +6 -0
  61. package/dist/prompts/ExtractorPrompt.d.ts.map +1 -0
  62. package/dist/prompts/ParallelMergerPrompt.d.ts +5 -0
  63. package/dist/prompts/ParallelMergerPrompt.d.ts.map +1 -0
  64. package/dist/prompts/SequentialExtractorPrompt.d.ts +6 -0
  65. package/dist/prompts/SequentialExtractorPrompt.d.ts.map +1 -0
  66. package/dist/prompts/formatArtifacts.d.ts +3 -0
  67. package/dist/prompts/formatArtifacts.d.ts.map +1 -0
  68. package/dist/strategies/DoublePassAutoMergeStrategy.d.ts +23 -0
  69. package/dist/strategies/DoublePassAutoMergeStrategy.d.ts.map +1 -0
  70. package/dist/strategies/DoublePassStrategy.d.ts +22 -0
  71. package/dist/strategies/DoublePassStrategy.d.ts.map +1 -0
  72. package/dist/strategies/ParallelAutoMergeStrategy.d.ts +27 -0
  73. package/dist/strategies/ParallelAutoMergeStrategy.d.ts.map +1 -0
  74. package/dist/strategies/ParallelStrategy.d.ts +22 -0
  75. package/dist/strategies/ParallelStrategy.d.ts.map +1 -0
  76. package/dist/strategies/SequentialAutoMergeStrategy.d.ts +22 -0
  77. package/dist/strategies/SequentialAutoMergeStrategy.d.ts.map +1 -0
  78. package/dist/strategies/SequentialStrategy.d.ts +20 -0
  79. package/dist/strategies/SequentialStrategy.d.ts.map +1 -0
  80. package/dist/strategies/SimpleStrategy.d.ts +18 -0
  81. package/dist/strategies/SimpleStrategy.d.ts.map +1 -0
  82. package/dist/strategies/agent/AgentStrategy.d.ts +44 -0
  83. package/dist/strategies/agent/AgentStrategy.d.ts.map +1 -0
  84. package/dist/strategies/agent/AgentTools.d.ts +55 -0
  85. package/dist/strategies/agent/AgentTools.d.ts.map +1 -0
  86. package/dist/strategies/agent/ArtifactFilesystem.d.ts +51 -0
  87. package/dist/strategies/agent/ArtifactFilesystem.d.ts.map +1 -0
  88. package/dist/strategies/agent/index.d.ts +4 -0
  89. package/dist/strategies/agent/index.d.ts.map +1 -0
  90. package/dist/strategies/concurrency.d.ts +2 -0
  91. package/dist/strategies/concurrency.d.ts.map +1 -0
  92. package/{src/strategies/index.ts → dist/strategies/index.d.ts} +2 -0
  93. package/dist/strategies/index.d.ts.map +1 -0
  94. package/dist/strategies/utils.d.ts +39 -0
  95. package/dist/strategies/utils.d.ts.map +1 -0
  96. package/dist/strategies.d.ts +1 -0
  97. package/dist/strategies.js +3930 -0
  98. package/dist/strategies.js.map +1 -0
  99. package/dist/tokenization.d.ts +11 -0
  100. package/dist/tokenization.d.ts.map +1 -0
  101. package/dist/types.d.ts +178 -0
  102. package/dist/types.d.ts.map +1 -0
  103. package/dist/validation/validator.d.ts +20 -0
  104. package/dist/validation/validator.d.ts.map +1 -0
  105. package/package.json +30 -14
  106. package/src/agent-cli-integration.test.ts +0 -47
  107. package/src/agent-export.test.ts +0 -17
  108. package/src/agent-tool-labels.test.ts +0 -50
  109. package/src/artifacts/AGENTS.md +0 -16
  110. package/src/artifacts/fileToArtifact.test.ts +0 -37
  111. package/src/artifacts/fileToArtifact.ts +0 -44
  112. package/src/artifacts/input.test.ts +0 -243
  113. package/src/artifacts/input.ts +0 -360
  114. package/src/artifacts/providers.test.ts +0 -19
  115. package/src/artifacts/urlToArtifact.test.ts +0 -23
  116. package/src/artifacts/urlToArtifact.ts +0 -19
  117. package/src/auth/AGENTS.md +0 -11
  118. package/src/auth/config.test.ts +0 -132
  119. package/src/auth/config.ts +0 -186
  120. package/src/auth/tokens.test.ts +0 -58
  121. package/src/auth/tokens.ts +0 -229
  122. package/src/chunking/AGENTS.md +0 -11
  123. package/src/chunking/ArtifactBatcher.test.ts +0 -22
  124. package/src/chunking/ArtifactBatcher.ts +0 -110
  125. package/src/chunking/ArtifactSplitter.test.ts +0 -38
  126. package/src/chunking/ArtifactSplitter.ts +0 -151
  127. package/src/debug/AGENTS.md +0 -79
  128. package/src/debug/logger.test.ts +0 -244
  129. package/src/debug/logger.ts +0 -211
  130. package/src/extract.test.ts +0 -22
  131. package/src/extract.ts +0 -150
  132. package/src/fields.test.ts +0 -681
  133. package/src/fields.ts +0 -246
  134. package/src/index.test.ts +0 -20
  135. package/src/index.ts +0 -110
  136. package/src/llm/AGENTS.md +0 -9
  137. package/src/llm/LLMClient.test.ts +0 -394
  138. package/src/llm/LLMClient.ts +0 -264
  139. package/src/llm/RetryingRunner.test.ts +0 -174
  140. package/src/llm/RetryingRunner.ts +0 -270
  141. package/src/llm/message.test.ts +0 -42
  142. package/src/llm/message.ts +0 -47
  143. package/src/llm/models.test.ts +0 -82
  144. package/src/llm/models.ts +0 -190
  145. package/src/llm/resolveModel.ts +0 -86
  146. package/src/merge/AGENTS.md +0 -6
  147. package/src/merge/Deduplicator.test.ts +0 -108
  148. package/src/merge/Deduplicator.ts +0 -45
  149. package/src/merge/SmartDataMerger.test.ts +0 -177
  150. package/src/merge/SmartDataMerger.ts +0 -56
  151. package/src/parsers/AGENTS.md +0 -58
  152. package/src/parsers/collect.test.ts +0 -56
  153. package/src/parsers/collect.ts +0 -31
  154. package/src/parsers/mime.test.ts +0 -91
  155. package/src/parsers/mime.ts +0 -137
  156. package/src/parsers/npm.ts +0 -26
  157. package/src/parsers/pdf.test.ts +0 -394
  158. package/src/parsers/pdf.ts +0 -194
  159. package/src/parsers/runner.test.ts +0 -95
  160. package/src/parsers/runner.ts +0 -177
  161. package/src/parsers/types.ts +0 -29
  162. package/src/prompts/AGENTS.md +0 -8
  163. package/src/prompts/DeduplicationPrompt.test.ts +0 -41
  164. package/src/prompts/DeduplicationPrompt.ts +0 -37
  165. package/src/prompts/ExtractorPrompt.test.ts +0 -21
  166. package/src/prompts/ExtractorPrompt.ts +0 -72
  167. package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
  168. package/src/prompts/ParallelMergerPrompt.ts +0 -37
  169. package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
  170. package/src/prompts/SequentialExtractorPrompt.ts +0 -82
  171. package/src/prompts/formatArtifacts.test.ts +0 -39
  172. package/src/prompts/formatArtifacts.ts +0 -46
  173. package/src/strategies/AGENTS.md +0 -6
  174. package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
  175. package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
  176. package/src/strategies/DoublePassStrategy.test.ts +0 -48
  177. package/src/strategies/DoublePassStrategy.ts +0 -266
  178. package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
  179. package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
  180. package/src/strategies/ParallelStrategy.test.ts +0 -61
  181. package/src/strategies/ParallelStrategy.ts +0 -208
  182. package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
  183. package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
  184. package/src/strategies/SequentialStrategy.test.ts +0 -53
  185. package/src/strategies/SequentialStrategy.ts +0 -142
  186. package/src/strategies/SimpleStrategy.test.ts +0 -46
  187. package/src/strategies/SimpleStrategy.ts +0 -94
  188. package/src/strategies/concurrency.test.ts +0 -16
  189. package/src/strategies/concurrency.ts +0 -14
  190. package/src/strategies/index.test.ts +0 -20
  191. package/src/strategies/utils.test.ts +0 -76
  192. package/src/strategies/utils.ts +0 -95
  193. package/src/tokenization.test.ts +0 -119
  194. package/src/tokenization.ts +0 -71
  195. package/src/types.test.ts +0 -25
  196. package/src/types.ts +0 -174
  197. package/src/validation/AGENTS.md +0 -7
  198. package/src/validation/validator.test.ts +0 -204
  199. package/src/validation/validator.ts +0 -90
  200. package/tsconfig.json +0 -22
@@ -1,58 +0,0 @@
1
- # Parsers module
2
-
3
- - Purpose: detect MIME types, run external/npm/command parsers, and provide built-in PDF support.
4
- - Key files: `types.ts`, `collect.ts`, `mime.ts`, `npm.ts`, `runner.ts`, `pdf.ts`, `index.ts`.
5
-
6
- ## Types (`types.ts`)
7
-
8
- - `NpmParserDef` — npm package parser definition (`type: "npm"`, `package: string`)
9
- - `CommandFileDef` — command with `FILE_PATH` placeholder (`type: "command-file"`, `command: string`)
10
- - `CommandStdinDef` — command that reads from stdin (`type: "command-stdin"`, `command: string`)
11
- - `InlineParserDef` — inline handler function (`type: "inline"`, `handler: (buffer: Buffer) => Promise<Artifact>`)
12
- - `ParserDef` — union of the four variants
13
- - `ParsersConfig` — `Record<string, ParserDef>` keyed by MIME type
14
- - `ParserInput` — `{ kind: "file"; path: string } | { kind: "buffer"; buffer: Buffer }`
15
-
16
- ## npm Contract (`npm.ts`)
17
-
18
- - `ParseStreamFn`, `ParseFileFn`, `DetectFileTypeFn`, `NpmParserModule` — interfaces that npm parser packages must implement.
19
- - At least one of `parseStream` or `parseFile` must be exported.
20
-
21
- ## collectStream (`collect.ts`)
22
-
23
- - `collectStream(stream: ReadableStream<Uint8Array>): Promise<Buffer>` — public utility for npm parser authors to collect a stream into a Buffer.
24
-
25
- ## MIME Detection (`mime.ts`)
26
-
27
- Detection runs in three steps (two built-in layers plus npm `detectFileType` callbacks), tried in this order:
28
-
29
- 1. **Magic bytes** (authoritative): PDF, PNG, JPEG, GIF, WebP, ZIP/Office
30
- 2. **npm `detectFileType`**: called with the first 512 bytes of the buffer when no built-in magic-byte signature matches
31
- 3. **Extension database**: fallback when neither magic bytes nor an npm `detectFileType` callback matches (requires a `filePath`)
32
-
33
- `detectMimeType({ buffer?, filePath?, mimeOverride?, npmParsers? }): Promise<string | null>`
34
-
35
- ## Runner (`runner.ts`)
36
-
37
- `runParser(def: ParserDef, input: ParserInput, mimeType: string): Promise<Artifact[]>`
38
-
39
- - **npm**: Dynamic import, prefer `parseFile` for file inputs (zero-copy), prefer `parseStream` for buffer inputs. Falls back via temp-file if needed.
40
- - **command-file**: Interpolates `FILE_PATH` in command, writes temp file for buffer inputs.
41
- - **command-stdin**: Pipes input buffer to subprocess stdin; captures stdout as `SerializedArtifact[]` JSON.
42
- - **inline**: Calls the handler function directly with the buffer (reads file into buffer if needed).
43
-
44
- ## Built-in PDF Parser (`pdf.ts`)
45
-
46
- `parsePdf(input: Buffer | ReadableStream<Uint8Array>, options?: ParsePdfOptions): Promise<Artifact>`
47
-
48
- Uses `pdf-parse` (npm package). Extracts per-page text **and** embedded images into `ArtifactContent[]`
49
- with `page` numbers set. Returns an `Artifact` with `type: "pdf"`.
50
-
51
- - Text extraction: per-page via `parser.getText()`; falls back to full document text when no per-page info is available.
52
- - Image extraction: per-page via `parser.getImage({ imageBuffer: false, imageDataUrl: true })`. Each embedded image is mapped to an `ArtifactImage` with `base64` (raw base64 string, data-URL prefix stripped), `width`, `height`, and `imageType: "embedded"`. Images are merged into the `media` array of the matching `ArtifactContent` entry. Pages that contain images but no text produce their own content entry. Image extraction failure is non-fatal — the parser continues and returns text-only content.
53
- - Screenshot rendering: per-page via `parser.getScreenshot()`. Each page is rendered to a PNG image and added to the `media` array with `imageType: "screenshot"`. Screenshots are appended to any embedded images for the same page. Screenshot rendering failure is non-fatal — the parser continues without screenshots.
54
- - `imageThreshold` defaults to 80 px (from pdf-parse), filtering out tiny decorative images.
55
- - `ParsePdfOptions.includeImages` (default `true`): set to `false` to skip `getImage()` entirely and return text-only content. This is used by the `--no-images` CLI flag.
56
- - `ParsePdfOptions.screenshots` (default `false`): set to `true` to render page screenshots and include them as images. This is used by the `--screenshots` CLI flag.
57
- - `ParsePdfOptions.screenshotScale` (default `1.5`): scale factor for screenshot rendering. Higher values produce larger, higher-quality images.
58
- - `ParsePdfOptions.screenshotWidth`: target width in pixels for screenshots. If specified, takes precedence over `screenshotScale` and height is calculated to maintain aspect ratio.
@@ -1,56 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { collectStream } from "./collect";
3
-
4
- test("collectStream collects single chunk", async () => {
5
- const data = Buffer.from("hello world");
6
- const stream = new ReadableStream<Uint8Array>({
7
- start(controller) {
8
- controller.enqueue(data);
9
- controller.close();
10
- },
11
- });
12
-
13
- const result = await collectStream(stream);
14
- expect(result.toString()).toBe("hello world");
15
- });
16
-
17
- test("collectStream collects multiple chunks", async () => {
18
- const chunks = [Buffer.from("foo"), Buffer.from("bar"), Buffer.from("baz")];
19
- const stream = new ReadableStream<Uint8Array>({
20
- start(controller) {
21
- for (const chunk of chunks) {
22
- controller.enqueue(chunk);
23
- }
24
- controller.close();
25
- },
26
- });
27
-
28
- const result = await collectStream(stream);
29
- expect(result.toString()).toBe("foobarbaz");
30
- });
31
-
32
- test("collectStream returns Buffer", async () => {
33
- const stream = new ReadableStream<Uint8Array>({
34
- start(controller) {
35
- controller.enqueue(new Uint8Array([1, 2, 3]));
36
- controller.close();
37
- },
38
- });
39
-
40
- const result = await collectStream(stream);
41
- expect(Buffer.isBuffer(result)).toBe(true);
42
- expect(result[0]).toBe(1);
43
- expect(result[1]).toBe(2);
44
- expect(result[2]).toBe(3);
45
- });
46
-
47
- test("collectStream handles empty stream", async () => {
48
- const stream = new ReadableStream<Uint8Array>({
49
- start(controller) {
50
- controller.close();
51
- },
52
- });
53
-
54
- const result = await collectStream(stream);
55
- expect(result.length).toBe(0);
56
- });
@@ -1,31 +0,0 @@
1
- /**
2
- * Collects a ReadableStream<Uint8Array> into a Buffer.
3
- * Uses Web Streams API — compatible with Bun and Node 18+.
4
- * Exported as a public utility for npm parser authors.
5
- */
6
- export async function collectStream(stream: ReadableStream<Uint8Array>): Promise<Buffer> {
7
- const reader = stream.getReader();
8
- const chunks: Uint8Array[] = [];
9
-
10
- try {
11
- while (true) {
12
- const { done, value } = await reader.read();
13
- if (done) {
14
- break;
15
- }
16
- chunks.push(value);
17
- }
18
- } finally {
19
- reader.releaseLock();
20
- }
21
-
22
- const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
23
- const result = new Uint8Array(totalLength);
24
- let offset = 0;
25
- for (const chunk of chunks) {
26
- result.set(chunk, offset);
27
- offset += chunk.length;
28
- }
29
-
30
- return Buffer.from(result);
31
- }
@@ -1,91 +0,0 @@
1
- import { test, expect } from "bun:test";
2
- import { detectMimeType } from "./mime";
3
-
4
- test("detectMimeType returns mimeOverride when provided", async () => {
5
- const result = await detectMimeType({ mimeOverride: "application/pdf" });
6
- expect(result).toBe("application/pdf");
7
- });
8
-
9
- test("detectMimeType detects PDF from magic bytes", async () => {
10
- const pdfHeader = Buffer.from([0x25, 0x50, 0x44, 0x46, 0x2d, 0x31, 0x2e, 0x34]);
11
- const result = await detectMimeType({ buffer: pdfHeader });
12
- expect(result).toBe("application/pdf");
13
- });
14
-
15
- test("detectMimeType detects PNG from magic bytes", async () => {
16
- const pngHeader = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);
17
- const result = await detectMimeType({ buffer: pngHeader });
18
- expect(result).toBe("image/png");
19
- });
20
-
21
- test("detectMimeType detects JPEG from magic bytes", async () => {
22
- const jpegHeader = Buffer.from([0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10]);
23
- const result = await detectMimeType({ buffer: jpegHeader });
24
- expect(result).toBe("image/jpeg");
25
- });
26
-
27
- test("detectMimeType detects GIF from magic bytes", async () => {
28
- const gifHeader = Buffer.from([0x47, 0x49, 0x46, 0x38, 0x39, 0x61]);
29
- const result = await detectMimeType({ buffer: gifHeader });
30
- expect(result).toBe("image/gif");
31
- });
32
-
33
- test("detectMimeType detects WebP from magic bytes", async () => {
34
- const webpHeader = Buffer.alloc(12);
35
- // RIFF at offset 0
36
- webpHeader[0] = 0x52; webpHeader[1] = 0x49; webpHeader[2] = 0x46; webpHeader[3] = 0x46;
37
- // WEBP at offset 8
38
- webpHeader[8] = 0x57; webpHeader[9] = 0x45; webpHeader[10] = 0x42; webpHeader[11] = 0x50;
39
- const result = await detectMimeType({ buffer: webpHeader });
40
- expect(result).toBe("image/webp");
41
- });
42
-
43
- test("detectMimeType falls back to extension lookup for .txt", async () => {
44
- const result = await detectMimeType({ filePath: "/some/file.txt" });
45
- expect(result).toBe("text/plain");
46
- });
47
-
48
- test("detectMimeType falls back to extension lookup for .md", async () => {
49
- const result = await detectMimeType({ filePath: "README.md" });
50
- expect(result).toBe("text/markdown");
51
- });
52
-
53
- test("detectMimeType falls back to extension lookup for .json", async () => {
54
- const result = await detectMimeType({ filePath: "data.json" });
55
- expect(result).toBe("application/json");
56
- });
57
-
58
- test("detectMimeType falls back to extension lookup for .docx", async () => {
59
- const result = await detectMimeType({ filePath: "doc.docx" });
60
- expect(result).toBe("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
61
- });
62
-
63
- test("detectMimeType returns null for unknown extension with no magic bytes", async () => {
64
- const result = await detectMimeType({ filePath: "file.xyz" });
65
- expect(result).toBeNull();
66
- });
67
-
68
- test("detectMimeType returns null with no inputs", async () => {
69
- const result = await detectMimeType({});
70
- expect(result).toBeNull();
71
- });
72
-
73
- test("detectMimeType mimeOverride takes precedence over magic bytes", async () => {
74
- const pdfHeader = Buffer.from([0x25, 0x50, 0x44, 0x46]);
75
- const result = await detectMimeType({
76
- buffer: pdfHeader,
77
- mimeOverride: "text/plain",
78
- });
79
- expect(result).toBe("text/plain");
80
- });
81
-
82
- test("detectMimeType extension takes precedence over null buffer detection", async () => {
83
- // Buffer with no magic bytes, but file has .pdf extension
84
- const randomBuffer = Buffer.from([0x00, 0x01, 0x02, 0x03]);
85
- const result = await detectMimeType({
86
- buffer: randomBuffer,
87
- filePath: "document.pdf",
88
- });
89
- // magic bytes don't match, falls back to extension
90
- expect(result).toBe("application/pdf");
91
- });
@@ -1,137 +0,0 @@
1
- import path from "node:path";
2
- import type { NpmParserDef } from "./types";
3
-
4
- // Magic byte signatures for common file types
5
- const MAGIC_BYTES: Array<{ mimeType: string; bytes: number[]; offset?: number }> = [
6
- // PDF: %PDF
7
- { mimeType: "application/pdf", bytes: [0x25, 0x50, 0x44, 0x46] },
8
- // PNG: 89 50 4E 47
9
- { mimeType: "image/png", bytes: [0x89, 0x50, 0x4e, 0x47] },
10
- // JPEG: FF D8 FF
11
- { mimeType: "image/jpeg", bytes: [0xff, 0xd8, 0xff] },
12
- // GIF: GIF8
13
- { mimeType: "image/gif", bytes: [0x47, 0x49, 0x46, 0x38] },
14
- // ZIP / Office Open XML (DOCX/XLSX/PPTX all start with PK\x03\x04)
15
- {
16
- mimeType: "application/zip",
17
- bytes: [0x50, 0x4b, 0x03, 0x04],
18
- },
19
- ];
20
-
21
- // WebP has RIFF at offset 0 and WEBP at offset 8
22
- const isWebP = (header: Uint8Array): boolean => {
23
- if (header.length < 12) return false;
24
- const riff =
25
- header[0] === 0x52 && header[1] === 0x49 && header[2] === 0x46 && header[3] === 0x46;
26
- const webp =
27
- header[8] === 0x57 && header[9] === 0x45 && header[10] === 0x42 && header[11] === 0x50;
28
- return riff && webp;
29
- };
30
-
31
- const matchesMagicBytes = (header: Uint8Array, bytes: number[], offset = 0): boolean => {
32
- if (header.length < offset + bytes.length) return false;
33
- return bytes.every((b, i) => header[offset + i] === b);
34
- };
35
-
36
- const detectFromMagicBytes = (header: Uint8Array): string | null => {
37
- if (isWebP(header)) return "image/webp";
38
-
39
- for (const { mimeType, bytes, offset } of MAGIC_BYTES) {
40
- if (matchesMagicBytes(header, bytes, offset ?? 0)) {
41
- return mimeType;
42
- }
43
- }
44
-
45
- return null;
46
- };
47
-
48
- // Extension → MIME type lookup
49
- const EXTENSION_MIME_MAP: Record<string, string> = {
50
- ".txt": "text/plain",
51
- ".md": "text/markdown",
52
- ".markdown": "text/markdown",
53
- ".html": "text/html",
54
- ".htm": "text/html",
55
- ".json": "application/json",
56
- ".pdf": "application/pdf",
57
- ".png": "image/png",
58
- ".jpg": "image/jpeg",
59
- ".jpeg": "image/jpeg",
60
- ".gif": "image/gif",
61
- ".webp": "image/webp",
62
- ".csv": "text/csv",
63
- ".xml": "application/xml",
64
- ".yaml": "application/yaml",
65
- ".yml": "application/yaml",
66
- ".docx":
67
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
68
- ".xlsx":
69
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
70
- ".pptx":
71
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
72
- ".mp4": "video/mp4",
73
- ".mp3": "audio/mpeg",
74
- ".wav": "audio/wav",
75
- ".ogg": "audio/ogg",
76
- ".svg": "image/svg+xml",
77
- ".ts": "text/plain",
78
- ".tsx": "text/plain",
79
- ".js": "text/javascript",
80
- ".jsx": "text/javascript",
81
- ".css": "text/css",
82
- ".toml": "application/toml",
83
- };
84
-
85
- export type NpmParserEntry = {
86
- mimeType: string;
87
- def: NpmParserDef;
88
- };
89
-
90
- export async function detectMimeType(options: {
91
- buffer?: Buffer;
92
- filePath?: string;
93
- mimeOverride?: string;
94
- npmParsers?: NpmParserEntry[];
95
- }): Promise<string | null> {
96
- const { buffer, filePath, mimeOverride, npmParsers } = options;
97
-
98
- // --mime override takes precedence
99
- if (mimeOverride) {
100
- return mimeOverride;
101
- }
102
-
103
- // Layer 1: magic bytes (authoritative)
104
- if (buffer && buffer.length > 0) {
105
- const header = buffer.subarray(0, 512);
106
- const magicMime = detectFromMagicBytes(header);
107
- if (magicMime) {
108
- return magicMime;
109
- }
110
-
111
- // npm parser detectFileType callbacks (run after the built-in magic bytes, before the extension lookup)
112
- if (npmParsers && npmParsers.length > 0) {
113
- for (const entry of npmParsers) {
114
- try {
115
- const mod = await import(entry.def.package) as {
116
- detectFileType?: (header: Uint8Array) => boolean;
117
- };
118
- if (typeof mod.detectFileType === "function" && mod.detectFileType(header)) {
119
- return entry.mimeType;
120
- }
121
- } catch {
122
- // If the package fails to load, skip it
123
- }
124
- }
125
- }
126
- }
127
-
128
- // Extension database: final fallback, used only when a file path is available
129
- if (filePath) {
130
- const ext = path.extname(filePath).toLowerCase();
131
- if (ext && ext in EXTENSION_MIME_MAP) {
132
- return EXTENSION_MIME_MAP[ext] ?? null;
133
- }
134
- }
135
-
136
- return null;
137
- }
@@ -1,26 +0,0 @@
1
- import type { Artifact } from "../types";
2
-
3
- /**
4
- * Contract for npm parser packages.
5
- *
6
- * A parser package must export at least one of `parseStream` or `parseFile`.
7
- * `detectFileType` is optional.
8
- */
9
-
10
- // Parser receives a ReadableStream — no disk I/O needed
11
- export type ParseStreamFn = (
12
- stream: ReadableStream<Uint8Array>,
13
- mimeType: string,
14
- ) => Promise<Artifact[]>;
15
-
16
- // Parser receives a file path — useful for libraries that only work with files
17
- export type ParseFileFn = (filePath: string, mimeType: string) => Promise<Artifact[]>;
18
-
19
- // Magic byte detection — optional; return true if this parser handles the given bytes
20
- export type DetectFileTypeFn = (header: Uint8Array) => boolean;
21
-
22
- export type NpmParserModule = {
23
- parseStream?: ParseStreamFn;
24
- parseFile?: ParseFileFn;
25
- detectFileType?: DetectFileTypeFn;
26
- };