@struktur/sdk 2.1.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4111 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers.js +492 -0
- package/dist/parsers.js.map +1 -0
- package/dist/strategies.js +2435 -0
- package/dist/strategies.js.map +1 -0
- package/package.json +24 -12
- package/src/agent-cli-integration.test.ts +0 -47
- package/src/agent-export.test.ts +0 -17
- package/src/agent-tool-labels.test.ts +0 -50
- package/src/artifacts/AGENTS.md +0 -16
- package/src/artifacts/fileToArtifact.test.ts +0 -37
- package/src/artifacts/fileToArtifact.ts +0 -44
- package/src/artifacts/input.test.ts +0 -243
- package/src/artifacts/input.ts +0 -360
- package/src/artifacts/providers.test.ts +0 -19
- package/src/artifacts/providers.ts +0 -7
- package/src/artifacts/urlToArtifact.test.ts +0 -23
- package/src/artifacts/urlToArtifact.ts +0 -19
- package/src/auth/AGENTS.md +0 -11
- package/src/auth/config.test.ts +0 -132
- package/src/auth/config.ts +0 -186
- package/src/auth/tokens.test.ts +0 -58
- package/src/auth/tokens.ts +0 -229
- package/src/chunking/AGENTS.md +0 -11
- package/src/chunking/ArtifactBatcher.test.ts +0 -22
- package/src/chunking/ArtifactBatcher.ts +0 -110
- package/src/chunking/ArtifactSplitter.test.ts +0 -38
- package/src/chunking/ArtifactSplitter.ts +0 -151
- package/src/debug/AGENTS.md +0 -79
- package/src/debug/logger.test.ts +0 -244
- package/src/debug/logger.ts +0 -211
- package/src/extract.test.ts +0 -22
- package/src/extract.ts +0 -150
- package/src/fields.test.ts +0 -681
- package/src/fields.ts +0 -246
- package/src/index.test.ts +0 -20
- package/src/index.ts +0 -110
- package/src/llm/AGENTS.md +0 -9
- package/src/llm/LLMClient.test.ts +0 -394
- package/src/llm/LLMClient.ts +0 -264
- package/src/llm/RetryingRunner.test.ts +0 -174
- package/src/llm/RetryingRunner.ts +0 -270
- package/src/llm/message.test.ts +0 -42
- package/src/llm/message.ts +0 -47
- package/src/llm/models.test.ts +0 -82
- package/src/llm/models.ts +0 -190
- package/src/llm/resolveModel.ts +0 -86
- package/src/merge/AGENTS.md +0 -6
- package/src/merge/Deduplicator.test.ts +0 -108
- package/src/merge/Deduplicator.ts +0 -45
- package/src/merge/SmartDataMerger.test.ts +0 -177
- package/src/merge/SmartDataMerger.ts +0 -56
- package/src/parsers/AGENTS.md +0 -58
- package/src/parsers/collect.test.ts +0 -56
- package/src/parsers/collect.ts +0 -31
- package/src/parsers/index.ts +0 -6
- package/src/parsers/mime.test.ts +0 -91
- package/src/parsers/mime.ts +0 -137
- package/src/parsers/npm.ts +0 -26
- package/src/parsers/pdf.test.ts +0 -394
- package/src/parsers/pdf.ts +0 -194
- package/src/parsers/runner.test.ts +0 -95
- package/src/parsers/runner.ts +0 -177
- package/src/parsers/types.ts +0 -29
- package/src/prompts/AGENTS.md +0 -8
- package/src/prompts/DeduplicationPrompt.test.ts +0 -41
- package/src/prompts/DeduplicationPrompt.ts +0 -37
- package/src/prompts/ExtractorPrompt.test.ts +0 -21
- package/src/prompts/ExtractorPrompt.ts +0 -72
- package/src/prompts/ParallelMergerPrompt.test.ts +0 -8
- package/src/prompts/ParallelMergerPrompt.ts +0 -37
- package/src/prompts/SequentialExtractorPrompt.test.ts +0 -24
- package/src/prompts/SequentialExtractorPrompt.ts +0 -82
- package/src/prompts/formatArtifacts.test.ts +0 -39
- package/src/prompts/formatArtifacts.ts +0 -46
- package/src/strategies/AGENTS.md +0 -6
- package/src/strategies/DoublePassAutoMergeStrategy.test.ts +0 -53
- package/src/strategies/DoublePassAutoMergeStrategy.ts +0 -410
- package/src/strategies/DoublePassStrategy.test.ts +0 -48
- package/src/strategies/DoublePassStrategy.ts +0 -266
- package/src/strategies/ParallelAutoMergeStrategy.test.ts +0 -152
- package/src/strategies/ParallelAutoMergeStrategy.ts +0 -345
- package/src/strategies/ParallelStrategy.test.ts +0 -61
- package/src/strategies/ParallelStrategy.ts +0 -208
- package/src/strategies/SequentialAutoMergeStrategy.test.ts +0 -66
- package/src/strategies/SequentialAutoMergeStrategy.ts +0 -325
- package/src/strategies/SequentialStrategy.test.ts +0 -53
- package/src/strategies/SequentialStrategy.ts +0 -142
- package/src/strategies/SimpleStrategy.test.ts +0 -46
- package/src/strategies/SimpleStrategy.ts +0 -94
- package/src/strategies/concurrency.test.ts +0 -16
- package/src/strategies/concurrency.ts +0 -14
- package/src/strategies/index.test.ts +0 -20
- package/src/strategies/index.ts +0 -7
- package/src/strategies/utils.test.ts +0 -76
- package/src/strategies/utils.ts +0 -95
- package/src/tokenization.test.ts +0 -119
- package/src/tokenization.ts +0 -71
- package/src/types.test.ts +0 -25
- package/src/types.ts +0 -174
- package/src/validation/AGENTS.md +0 -7
- package/src/validation/validator.test.ts +0 -204
- package/src/validation/validator.ts +0 -90
- package/tsconfig.json +0 -22
package/src/parsers/AGENTS.md
DELETED
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
# Parsers module
|
|
2
|
-
|
|
3
|
-
- Purpose: detect MIME types, run external/npm/command parsers, and provide built-in PDF support.
|
|
4
|
-
- Key files: `types.ts`, `collect.ts`, `mime.ts`, `npm.ts`, `runner.ts`, `pdf.ts`, `index.ts`.
|
|
5
|
-
|
|
6
|
-
## Types (`types.ts`)
|
|
7
|
-
|
|
8
|
-
- `NpmParserDef` — npm package parser definition (`type: "npm"`, `package: string`)
|
|
9
|
-
- `CommandFileDef` — command with `FILE_PATH` placeholder (`type: "command-file"`, `command: string`)
|
|
10
|
-
- `CommandStdinDef` — command that reads from stdin (`type: "command-stdin"`, `command: string`)
|
|
11
|
-
- `InlineParserDef` — inline handler function (`type: "inline"`, `handler: (buffer: Buffer) => Promise<Artifact>`)
|
|
12
|
-
- `ParserDef` — union of the four variants
|
|
13
|
-
- `ParsersConfig` — `Record<string, ParserDef>` keyed by MIME type
|
|
14
|
-
- `ParserInput` — `{ kind: "file"; path: string } | { kind: "buffer"; buffer: Buffer }`
|
|
15
|
-
|
|
16
|
-
## npm Contract (`npm.ts`)
|
|
17
|
-
|
|
18
|
-
- `ParseStreamFn`, `ParseFileFn`, `DetectFileTypeFn`, `NpmParserModule` — interfaces that npm parser packages must implement.
|
|
19
|
-
- At least one of `parseStream` or `parseFile` must be exported.
|
|
20
|
-
|
|
21
|
-
## collectStream (`collect.ts`)
|
|
22
|
-
|
|
23
|
-
- `collectStream(stream: ReadableStream<Uint8Array>): Promise<Buffer>` — public utility for npm parser authors to collect a stream into a Buffer.
|
|
24
|
-
|
|
25
|
-
## MIME Detection (`mime.ts`)
|
|
26
|
-
|
|
27
|
-
Detection runs in three stages, tried in order (two built-in layers plus npm `detectFileType` callbacks):
|
|
28
|
-
|
|
29
|
-
1. **Magic bytes** (authoritative): PDF, PNG, JPEG, GIF, WebP, ZIP/Office
|
|
30
|
-
2. **npm `detectFileType`**: called after magic bytes with first 512 bytes
|
|
31
|
-
3. **Extension database**: fallback when no magic bytes match (file inputs only)
|
|
32
|
-
|
|
33
|
-
`detectMimeType({ buffer?, filePath?, mimeOverride?, npmParsers? }): Promise<string | null>`
|
|
34
|
-
|
|
35
|
-
## Runner (`runner.ts`)
|
|
36
|
-
|
|
37
|
-
`runParser(def: ParserDef, input: ParserInput, mimeType: string): Promise<Artifact[]>`
|
|
38
|
-
|
|
39
|
-
- **npm**: Dynamic import, prefer `parseFile` for file inputs (zero-copy), prefer `parseStream` for buffer inputs. Falls back via temp-file if needed.
|
|
40
|
-
- **command-file**: Interpolates `FILE_PATH` in command, writes temp file for buffer inputs.
|
|
41
|
-
- **command-stdin**: Pipes input buffer to subprocess stdin; captures stdout as `SerializedArtifact[]` JSON.
|
|
42
|
-
- **inline**: Calls the handler function directly with the buffer (reads file into buffer if needed).
|
|
43
|
-
|
|
44
|
-
## Built-in PDF Parser (`pdf.ts`)
|
|
45
|
-
|
|
46
|
-
`parsePdf(input: Buffer | ReadableStream<Uint8Array>, options?: ParsePdfOptions): Promise<Artifact>`
|
|
47
|
-
|
|
48
|
-
Uses `pdf-parse` (npm package). Extracts per-page text **and** embedded images into `ArtifactContent[]`
|
|
49
|
-
with `page` numbers set. Returns an `Artifact` with `type: "pdf"`.
|
|
50
|
-
|
|
51
|
-
- Text extraction: per-page via `parser.getText()`; falls back to full document text when no per-page info is available.
|
|
52
|
-
- Image extraction: per-page via `parser.getImage({ imageBuffer: false, imageDataUrl: true })`. Each embedded image is mapped to an `ArtifactImage` with `base64` (raw base64 string, data-URL prefix stripped), `width`, `height`, and `imageType: "embedded"`. Images are merged into the `media` array of the matching `ArtifactContent` entry. Pages that contain images but no text produce their own content entry. Image extraction failure is non-fatal — the parser continues and returns text-only content.
|
|
53
|
-
- Screenshot rendering: per-page via `parser.getScreenshot()`. Each page is rendered to a PNG image and added to the `media` array with `imageType: "screenshot"`. Screenshots are appended to any embedded images for the same page. Screenshot rendering failure is non-fatal — the parser continues without screenshots.
|
|
54
|
-
- `imageThreshold` defaults to 80 px (from pdf-parse), filtering out tiny decorative images.
|
|
55
|
-
- `ParsePdfOptions.includeImages` (default `true`): set to `false` to skip `getImage()` entirely and return text-only content. This is used by the `--no-images` CLI flag.
|
|
56
|
-
- `ParsePdfOptions.screenshots` (default `false`): set to `true` to render page screenshots and include them as images. This is used by the `--screenshots` CLI flag.
|
|
57
|
-
- `ParsePdfOptions.screenshotScale` (default `1.5`): scale factor for screenshot rendering. Higher values produce larger, higher-quality images.
|
|
58
|
-
- `ParsePdfOptions.screenshotWidth`: target width in pixels for screenshots. If specified, takes precedence over `screenshotScale` and height is calculated to maintain aspect ratio.
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { collectStream } from "./collect";
|
|
3
|
-
|
|
4
|
-
/** Builds a ReadableStream that enqueues the given chunks in order and then closes. */
const streamOf = (...parts: Uint8Array[]): ReadableStream<Uint8Array> =>
  new ReadableStream<Uint8Array>({
    start(controller) {
      parts.forEach((part) => controller.enqueue(part));
      controller.close();
    },
  });

// A lone chunk comes back with its bytes unchanged.
test("collectStream collects single chunk", async () => {
  const collected = await collectStream(streamOf(Buffer.from("hello world")));

  expect(collected.toString()).toBe("hello world");
});

// Several chunks are joined in arrival order.
test("collectStream collects multiple chunks", async () => {
  const parts = ["foo", "bar", "baz"].map((text) => Buffer.from(text));

  const collected = await collectStream(streamOf(...parts));

  expect(collected.toString()).toBe("foobarbaz");
});

// Plain Uint8Array chunks still yield a Buffer with the same bytes.
test("collectStream returns Buffer", async () => {
  const collected = await collectStream(streamOf(new Uint8Array([1, 2, 3])));

  expect(Buffer.isBuffer(collected)).toBe(true);
  expect(collected[0]).toBe(1);
  expect(collected[1]).toBe(2);
  expect(collected[2]).toBe(3);
});

// A stream that closes without emitting anything produces an empty result.
test("collectStream handles empty stream", async () => {
  const collected = await collectStream(streamOf());

  expect(collected.length).toBe(0);
});
|
package/src/parsers/collect.ts
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
/**
 * Collects a ReadableStream<Uint8Array> into a single Buffer.
 * Uses Web Streams API — compatible with Bun and Node 18+.
 * Exported as a public utility for npm parser authors.
 *
 * @param stream - Stream to drain. It is locked for the duration of the call;
 *   the reader lock is released before returning or rethrowing.
 * @returns A Buffer holding every chunk's bytes in arrival order, or a
 *   zero-length Buffer when the stream closes without emitting a chunk.
 * @throws Whatever `reader.read()` rejects with; the error is not caught here.
 */
export async function collectStream(stream: ReadableStream<Uint8Array>): Promise<Buffer> {
  const reader = stream.getReader();
  const chunks: Uint8Array[] = [];

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        break;
      }
      chunks.push(value);
    }
  } finally {
    // Always give the lock back, including when read() rejects.
    reader.releaseLock();
  }

  // Buffer.concat sizes the result and copies each chunk once, replacing the
  // previous hand-rolled Uint8Array assembly followed by a second Buffer.from copy.
  return Buffer.concat(chunks);
}
|
package/src/parsers/index.ts
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
export type { ParserDef, ParsersConfig, NpmParserDef, CommandFileDef, CommandStdinDef, InlineParserDef, ParserInput } from "./types";
|
|
2
|
-
export { runParser } from "./runner";
|
|
3
|
-
export { detectMimeType } from "./mime";
|
|
4
|
-
export { collectStream } from "./collect";
|
|
5
|
-
export { parsePdf } from "./pdf";
|
|
6
|
-
export type { ParsePdfOptions } from "./pdf";
|
package/src/parsers/mime.test.ts
DELETED
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import { test, expect } from "bun:test";
|
|
2
|
-
import { detectMimeType } from "./mime";
|
|
3
|
-
|
|
4
|
-
// An explicit override is returned as-is when nothing else is supplied.
test("detectMimeType returns mimeOverride when provided", async () => {
  expect(await detectMimeType({ mimeOverride: "application/pdf" })).toBe("application/pdf");
});

// Headers that must be recognised from their leading bytes alone.
const MAGIC_CASES: Array<[label: string, header: number[], expected: string]> = [
  ["PDF", [0x25, 0x50, 0x44, 0x46, 0x2d, 0x31, 0x2e, 0x34], "application/pdf"],
  ["PNG", [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], "image/png"],
  ["JPEG", [0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10], "image/jpeg"],
  ["GIF", [0x47, 0x49, 0x46, 0x38, 0x39, 0x61], "image/gif"],
  // RIFF at offset 0, four zero bytes, WEBP at offset 8
  ["WebP", [0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50], "image/webp"],
];

for (const [label, header, expected] of MAGIC_CASES) {
  test(`detectMimeType detects ${label} from magic bytes`, async () => {
    const detected = await detectMimeType({ buffer: Buffer.from(header) });
    expect(detected).toBe(expected);
  });
}

// Paths with no buffer are resolved purely from their extension.
const EXTENSION_CASES: Array<[ext: string, filePath: string, expected: string]> = [
  [".txt", "/some/file.txt", "text/plain"],
  [".md", "README.md", "text/markdown"],
  [".json", "data.json", "application/json"],
  [".docx", "doc.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
];

for (const [ext, filePath, expected] of EXTENSION_CASES) {
  test(`detectMimeType falls back to extension lookup for ${ext}`, async () => {
    const detected = await detectMimeType({ filePath });
    expect(detected).toBe(expected);
  });
}

test("detectMimeType returns null for unknown extension with no magic bytes", async () => {
  expect(await detectMimeType({ filePath: "file.xyz" })).toBeNull();
});

test("detectMimeType returns null with no inputs", async () => {
  expect(await detectMimeType({})).toBeNull();
});

test("detectMimeType mimeOverride takes precedence over magic bytes", async () => {
  const detected = await detectMimeType({
    mimeOverride: "text/plain",
    buffer: Buffer.from([0x25, 0x50, 0x44, 0x46]),
  });

  expect(detected).toBe("text/plain");
});

test("detectMimeType extension takes precedence over null buffer detection", async () => {
  // The bytes match no known signature, so the .pdf extension decides the result.
  const unrecognised = Buffer.from([0x00, 0x01, 0x02, 0x03]);

  const detected = await detectMimeType({ filePath: "document.pdf", buffer: unrecognised });

  expect(detected).toBe("application/pdf");
});
|
package/src/parsers/mime.ts
DELETED
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
import path from "node:path";
|
|
2
|
-
import type { NpmParserDef } from "./types";
|
|
3
|
-
|
|
4
|
-
// Magic byte signatures for common file types.
// `offset` is where the signature starts in the header; omitted means 0.
const MAGIC_BYTES: Array<{ mimeType: string; bytes: number[]; offset?: number }> = [
  // PDF: %PDF
  { mimeType: "application/pdf", bytes: [0x25, 0x50, 0x44, 0x46] },
  // PNG: 89 50 4E 47
  { mimeType: "image/png", bytes: [0x89, 0x50, 0x4e, 0x47] },
  // JPEG: FF D8 FF
  { mimeType: "image/jpeg", bytes: [0xff, 0xd8, 0xff] },
  // GIF: GIF8
  { mimeType: "image/gif", bytes: [0x47, 0x49, 0x46, 0x38] },
  // ZIP / Office Open XML (DOCX/XLSX/PPTX all start with PK\x03\x04)
  {
    mimeType: "application/zip",
    bytes: [0x50, 0x4b, 0x03, 0x04],
  },
];

/**
 * Returns true when `header` is a WebP file: "RIFF" at offset 0 and "WEBP" at
 * offset 8. Needs two separate signatures, so it is handled outside MAGIC_BYTES.
 */
const isWebP = (header: Uint8Array): boolean => {
  if (header.length < 12) return false;
  const riff =
    header[0] === 0x52 && header[1] === 0x49 && header[2] === 0x46 && header[3] === 0x46;
  const webp =
    header[8] === 0x57 && header[9] === 0x45 && header[10] === 0x42 && header[11] === 0x50;
  return riff && webp;
};

/** Returns true when `header` contains exactly `bytes` starting at `offset`. */
const matchesMagicBytes = (header: Uint8Array, bytes: number[], offset = 0): boolean => {
  // A header too short to hold the whole signature can never match.
  if (header.length < offset + bytes.length) return false;
  return bytes.every((b, i) => header[offset + i] === b);
};

/**
 * Maps a file header to a MIME type using the built-in signatures.
 * WebP is checked first, then MAGIC_BYTES in declaration order.
 *
 * @returns The first matching MIME type, or null when nothing matches.
 */
const detectFromMagicBytes = (header: Uint8Array): string | null => {
  if (isWebP(header)) return "image/webp";

  for (const { mimeType, bytes, offset } of MAGIC_BYTES) {
    if (matchesMagicBytes(header, bytes, offset ?? 0)) {
      return mimeType;
    }
  }

  return null;
};

// Extension → MIME type lookup. Keys are lower-case and include the leading dot.
const EXTENSION_MIME_MAP: Record<string, string> = {
  ".txt": "text/plain",
  ".md": "text/markdown",
  ".markdown": "text/markdown",
  ".html": "text/html",
  ".htm": "text/html",
  ".json": "application/json",
  ".pdf": "application/pdf",
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".csv": "text/csv",
  ".xml": "application/xml",
  ".yaml": "application/yaml",
  ".yml": "application/yaml",
  ".docx":
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".xlsx":
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  ".pptx":
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  ".mp4": "video/mp4",
  ".mp3": "audio/mpeg",
  ".wav": "audio/wav",
  ".ogg": "audio/ogg",
  ".svg": "image/svg+xml",
  ".ts": "text/plain",
  ".tsx": "text/plain",
  ".js": "text/javascript",
  ".jsx": "text/javascript",
  ".css": "text/css",
  ".toml": "application/toml",
};

/** An npm parser definition paired with the MIME type it is registered for. */
export type NpmParserEntry = {
  mimeType: string;
  def: NpmParserDef;
};

/**
 * Determines the MIME type of an input. Steps are tried in this order and the
 * first one that produces a result wins:
 *
 * 1. `mimeOverride`, when non-empty, is returned unchanged.
 * 2. Built-in magic bytes, checked against the first 512 bytes of `buffer`.
 * 3. Each npm parser's optional `detectFileType(header)`, in array order
 *    (only when a non-empty `buffer` was given).
 * 4. The lower-cased extension of `filePath`, looked up in EXTENSION_MIME_MAP.
 *
 * Because magic bytes run before the extension lookup, a buffer starting with
 * PK\x03\x04 is reported as "application/zip" even when `filePath` ends in
 * `.docx`, `.xlsx` or `.pptx`.
 *
 * @param options.buffer - File contents (or at least its leading bytes).
 * @param options.filePath - Path used only for its extension.
 * @param options.mimeOverride - Caller-supplied MIME type that bypasses detection.
 * @param options.npmParsers - npm parsers whose `detectFileType` should be consulted.
 * @returns The detected MIME type, or null when no step matched.
 */
export async function detectMimeType(options: {
  buffer?: Buffer;
  filePath?: string;
  mimeOverride?: string;
  npmParsers?: NpmParserEntry[];
}): Promise<string | null> {
  const { buffer, filePath, mimeOverride, npmParsers } = options;

  // --mime override takes precedence
  if (mimeOverride) {
    return mimeOverride;
  }

  if (buffer && buffer.length > 0) {
    const header = buffer.subarray(0, 512);

    // Layer 1: magic bytes (authoritative)
    const magicMime = detectFromMagicBytes(header);
    if (magicMime) {
      return magicMime;
    }

    // Layer 2: npm parser detectFileType callbacks (after built-ins)
    if (npmParsers && npmParsers.length > 0) {
      for (const entry of npmParsers) {
        try {
          const mod = await import(entry.def.package) as {
            detectFileType?: (header: Uint8Array) => boolean | Promise<boolean>;
          };
          if (typeof mod.detectFileType === "function") {
            // Await the verdict: an async detector returns a Promise, which is
            // always truthy and would otherwise match every input.
            const matched = await mod.detectFileType(header);
            if (matched) {
              return entry.mimeType;
            }
          }
        } catch {
          // Best effort: a package that fails to load, or a detector that
          // throws or rejects, is skipped so the remaining steps still run.
        }
      }
    }
  }

  // Layer 3: extension database (for file inputs)
  if (filePath) {
    const ext = path.extname(filePath).toLowerCase();
    if (ext && ext in EXTENSION_MIME_MAP) {
      return EXTENSION_MIME_MAP[ext] ?? null;
    }
  }

  return null;
}
|
package/src/parsers/npm.ts
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
import type { Artifact } from "../types";
|
|
2
|
-
|
|
3
|
-
/**
 * Contract for npm parser packages.
 *
 * A parser package must export at least one of `parseStream` or `parseFile`.
 * `detectFileType` is optional.
 */

/** Stream-based entry point: the parser receives a ReadableStream — no disk I/O needed. */
export type ParseStreamFn = (stream: ReadableStream<Uint8Array>, mimeType: string) => Promise<Artifact[]>;

/** Path-based entry point: the parser receives a file path — useful for libraries that only work with files. */
export type ParseFileFn = (
  filePath: string,
  mimeType: string,
) => Promise<Artifact[]>;

/** Optional magic byte detection; returns true if this parser handles the given bytes. */
export type DetectFileTypeFn = (header: Uint8Array) => boolean;

/** Shape of the exports read from an npm parser package; every member is optional at the type level. */
export type NpmParserModule = {
  parseStream?: ParseStreamFn;
  parseFile?: ParseFileFn;
  detectFileType?: DetectFileTypeFn;
};
|