@mlx-node/vlm 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,266 @@
1
+ # @mlx-node/vlm
2
+
3
+ Vision-language models and document processing pipelines for Node.js on Apple Silicon. Extract text, tables, and structure from documents and images using PaddleOCR-VL and the PP-StructureV3 pipeline — all running locally on Metal GPU.
4
+
5
+ ## Requirements
6
+
7
+ - macOS with Apple Silicon (M1 or later)
8
+ - Node.js 18+
9
+
10
+ ## Installation
11
+
12
+ ```bash
13
+ npm install @mlx-node/vlm
14
+ ```
15
+
16
+ ## Quick Start
17
+
18
+ ### Document Structure Analysis
19
+
20
+ Use `StructureV3Pipeline` for layout detection + OCR without a VLM:
21
+
22
+ ```typescript
23
+ import { StructureV3Pipeline } from '@mlx-node/vlm';
24
+
25
+ const pipeline = await StructureV3Pipeline.load({
26
+ layoutModelPath: './models/PP-DocLayoutV3',
27
+ textDetModelPath: './models/PP-OCRv5-det',
28
+ textRecModelPath: './models/PP-OCRv5-rec',
29
+ dictPath: './models/PP-OCRv5-rec/en_dict.txt',
30
+ });
31
+
32
+ const result = await pipeline.analyze('./document.png');
33
+ console.log(result.markdown);
34
+ ```
35
+
36
+ ### VLM-based OCR
37
+
38
+ Use `VLModel` for higher-quality OCR with document understanding:
39
+
40
+ ```typescript
41
+ import { VLModel, parsePaddleResponse } from '@mlx-node/vlm';
42
+
43
+ const model = await VLModel.load('./models/PaddleOCR-VL-1.5-mlx');
44
+
45
+ const result = model.chat([{ role: 'user', content: 'Read the text in this image.' }], {
46
+ images: [imageBuffer],
47
+ maxNewTokens: 2048,
48
+ });
49
+
50
+ const formatted = parsePaddleResponse(result.text, { format: 'markdown' });
51
+ console.log(formatted);
52
+ ```
53
+
54
+ ## StructureV3Pipeline
55
+
56
+ The PP-StructureV3 pipeline combines multiple specialized models for fast, accurate document processing:
57
+
58
+ ```
59
+ Input Image
60
+
61
+ ▼ (optional)
62
+ Orientation Correction → Dewarping
63
+
64
+
65
+ Layout Detection (25 categories)
66
+
67
+
68
+ Per-element Cropping
69
+
70
+
71
+ Text Detection → Text Recognition
72
+
73
+
74
+ Markdown Assembly
75
+ ```
76
+
77
+ ### Pipeline Options
78
+
79
+ ```typescript
80
+ const result = await pipeline.analyze(imageBuffer, {
81
+ layoutThreshold: 0.5, // Layout detection confidence threshold
82
+ textDetThreshold: 0.3, // Text detection confidence threshold
83
+ includeDetails: true, // Include per-element bounding boxes
84
+ useDocOrientationClassify: true, // Auto-correct rotation (requires DocOrientationModel)
85
+ useDocUnwarping: true, // Auto-correct perspective (requires DocUnwarpModel)
86
+ });
87
+
88
+ // result.elements — array of StructuredElement with label, bbox, text lines
89
+ // result.markdown — assembled markdown document
90
+ ```
91
+
92
+ ### OCR Without Layout
93
+
94
+ Run OCR directly on a pre-cropped image:
95
+
96
+ ```typescript
97
+ const lines = await pipeline.ocrImage(croppedImageBuffer);
98
+ for (const line of lines) {
99
+ console.log(`${line.text} (confidence: ${line.score.toFixed(2)})`);
100
+ }
101
+ ```
102
+
103
+ ## Individual Models
104
+
105
+ Each pipeline model can be used independently:
106
+
107
+ ### DocLayoutModel — Document Layout Detection
108
+
109
+ PP-DocLayoutV3 with RT-DETR architecture, 25 layout categories (title, text, table, figure, formula, header, footer, etc.):
110
+
111
+ ```typescript
112
+ import { DocLayoutModel } from '@mlx-node/vlm';
113
+
114
+ const layout = await DocLayoutModel.load('./models/PP-DocLayoutV3');
115
+ const elements = layout.detect(imageBuffer, 0.5); // threshold
116
+
117
+ for (const el of elements) {
118
+ console.log(`${el.labelName} (${el.score.toFixed(2)}) at [${el.bbox}]`);
119
+ }
120
+ ```
121
+
122
+ ### TextDetModel — Text Line Detection
123
+
124
+ PP-OCRv5 DBNet with PPHGNetV2 backbone:
125
+
126
+ ```typescript
127
+ import { TextDetModel } from '@mlx-node/vlm';
128
+
129
+ const detector = await TextDetModel.load('./models/PP-OCRv5-det');
130
+ const boxes = detector.detect(imageBuffer);
131
+ ```
132
+
133
+ ### TextRecModel — Text Recognition
134
+
135
+ PP-OCRv5 SVTR neck + CTC head:
136
+
137
+ ```typescript
138
+ import { TextRecModel } from '@mlx-node/vlm';
139
+
140
+ const recognizer = await TextRecModel.load('./models/PP-OCRv5-rec');
141
+ const results = recognizer.recognizeBatch(croppedImages);
142
+ // [{ text: "Hello world", score: 0.98 }, ...]
143
+ ```
144
+
145
+ ### DocOrientationModel — Orientation Classification
146
+
147
+ Classifies document rotation (0/90/180/270 degrees):
148
+
149
+ ```typescript
150
+ import { DocOrientationModel } from '@mlx-node/vlm';
151
+
152
+ const classifier = await DocOrientationModel.load('./models/PP-LCNet_x1_0_doc_ori-mlx');
153
+ const { angle, score } = classifier.classify(imageBuffer);
154
+ const { image } = classifier.classifyAndRotate(imageBuffer); // auto-correct
155
+ ```
156
+
157
+ ### DocUnwarpModel — Document Dewarping
158
+
159
+ UVDocNet-based perspective correction:
160
+
161
+ ```typescript
162
+ import { DocUnwarpModel } from '@mlx-node/vlm';
163
+
164
+ const unwarper = await DocUnwarpModel.load('./models/UVDoc-mlx');
165
+ const { image } = unwarper.unwarp(imageBuffer);
166
+ ```
167
+
168
+ ## VLModel — Vision-Language Model
169
+
170
+ PaddleOCR-VL architecture (ERNIE language model + vision encoder):
171
+
172
+ ```typescript
173
+ import { VLModel } from '@mlx-node/vlm';
174
+
175
+ const model = await VLModel.load('./models/PaddleOCR-VL-1.5-mlx');
176
+
177
+ // Single image chat
178
+ const result = model.chat(messages, {
179
+ images: [imageBuffer],
180
+ maxNewTokens: 2048,
181
+ temperature: 0.1,
182
+ });
183
+
184
+ // Simple OCR
185
+ const text = model.ocr(imageBuffer);
186
+
187
+ // Batch OCR (multiple images)
188
+ const results = model.ocrBatch([image1, image2, image3]);
189
+
190
+ // Batch chat (different prompts per image)
191
+ const batchResults = model.batch([
192
+ { messages: tableMessages, images: [tableImage] },
193
+ { messages: formulaMessages, images: [formulaImage] },
194
+ ]);
195
+ ```
196
+
197
+ ## Output Parsing
198
+
199
+ Parse VLM output into structured documents:
200
+
201
+ ```typescript
202
+ import { parseVlmOutput, formatDocument, parsePaddleResponse, OutputFormat } from '@mlx-node/vlm';
203
+
204
+ // Two-step: parse then format
205
+ const doc = parseVlmOutput(result.text);
206
+ const markdown = formatDocument(doc, { format: OutputFormat.Markdown });
207
+
208
+ // One-step: parse and format
209
+ const html = parsePaddleResponse(result.text, { format: 'html' });
210
+ ```
211
+
212
+ ### XLSX Export
213
+
214
+ ```typescript
215
+ import { saveToXlsx, documentToXlsx } from '@mlx-node/vlm';
216
+
217
+ // Direct save
218
+ saveToXlsx(result.text, './output.xlsx');
219
+
220
+ // Or get buffer
221
+ const doc = parseVlmOutput(result.text);
222
+ const buffer = documentToXlsx(doc);
223
+ ```
224
+
225
+ ## API Reference
226
+
227
+ ### Pipeline
228
+
229
+ | Class/Function | Description |
230
+ | -------------------------------------- | ----------------------------------------------------------------- |
231
+ | `StructureV3Pipeline.load(config)` | Load pipeline with layout, text detection, and recognition models |
232
+ | `pipeline.analyze(image, options?)` | Full document analysis returning `StructuredDocument` |
233
+ | `pipeline.ocrImage(image, threshold?)` | OCR on a single cropped image |
234
+
235
+ ### Models
236
+
237
+ | Class | Description |
238
+ | --------------------- | ------------------------------------------------------------------------------- |
239
+ | `VLModel` | PaddleOCR-VL vision-language model — `chat()`, `ocr()`, `ocrBatch()`, `batch()` |
240
+ | `DocLayoutModel` | PP-DocLayoutV3 layout detection — `detect()` |
241
+ | `TextDetModel` | PP-OCRv5 text detection — `detect()`, `detectCrop()` |
242
+ | `TextRecModel` | PP-OCRv5 text recognition — `recognize()`, `recognizeBatch()` |
243
+ | `DocOrientationModel` | Orientation classifier — `classify()`, `classifyAndRotate()` |
244
+ | `DocUnwarpModel` | Document dewarping — `unwarp()` |
245
+
246
+ ### Output
247
+
248
+ | Function | Description |
249
+ | ------------------------------------ | ---------------------------------------- |
250
+ | `parseVlmOutput(text)` | Parse raw VLM text to `ParsedDocument` |
251
+ | `formatDocument(doc, config?)` | Format to markdown, HTML, plain, or JSON |
252
+ | `parsePaddleResponse(text, config?)` | Parse and format in one step |
253
+ | `documentToXlsx(doc)` | Convert parsed document to XLSX buffer |
254
+ | `saveToXlsx(text, path)` | Parse and save to XLSX file |
255
+
256
+ ### Configs
257
+
258
+ | Export | Description |
259
+ | --------------------------- | ------------------------------------------------ |
260
+ | `PADDLEOCR_VL_CONFIGS` | Pre-defined PaddleOCR-VL 1.5 config |
261
+ | `createPaddleocrVlConfig()` | Default config factory |
262
+ | `OutputFormat` | Enum: `Raw`, `Plain`, `Markdown`, `Html`, `Json` |
263
+
264
+ ## License
265
+
266
+ [MIT](https://github.com/mlx-node/mlx-node/blob/main/LICENSE)
@@ -0,0 +1,41 @@
1
+ /**
2
+ * @mlx-node/vlm - Vision Language Model support for MLX-Node
3
+ *
4
+ * This package provides VLM capabilities including:
5
+ * - VLModel for OCR and document understanding tasks
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { VLModel } from '@mlx-node/vlm';
10
+ *
11
+ * // Load a model
12
+ * const model = await VLModel.load('./models/paddleocr-vl');
13
+ *
14
+ * // Chat with images
15
+ * const imageBuffer = readFileSync('./photo.jpg');
16
+ * const result = await model.chat(
17
+ * [{ role: 'user', content: 'What is in this image?' }],
18
+ * { images: [imageBuffer] }
19
+ * );
20
+ * console.log(result.text);
21
+ *
22
+ * // Simple OCR
23
+ * const text = await model.ocr(readFileSync('./document.jpg'));
24
+ *
25
+ * // Batch OCR (multiple images)
26
+ * const texts = await model.ocrBatch([readFileSync('page1.jpg'), readFileSync('page2.jpg')]);
27
+ * ```
28
+ */
29
+ export { VLModel, createPaddleocrVlConfig } from '@mlx-node/core';
30
+ export { DocLayoutModel, type LayoutElement } from '@mlx-node/core';
31
+ export { TextDetModel, type TextBox, TextRecModel, type RecResult } from '@mlx-node/core';
32
+ export { DocOrientationModel, type OrientationResult, type ClassifyRotateResult, DocUnwarpModel, type UnwarpResult, } from '@mlx-node/core';
33
+ export { StructureV3Pipeline, type StructureV3Config, type AnalyzeOptions, type StructuredElement, type StructuredDocument, type TextLine, } from './pipeline/structure-v3.js';
34
+ export type { VisionConfig, TextConfig, ModelConfig, VlmChatConfig, VlmChatMessage, VlmBatchItem, } from '@mlx-node/core';
35
+ export { PADDLEOCR_VL_CONFIGS, type PaddleOCRVLConfig } from './models/paddleocr-vl-configs.js';
36
+ export { VlmChatResult, type VLMChatResult } from '@mlx-node/core';
37
+ export { parsePaddleResponse, parseVlmOutput, formatDocument, type ParsedDocument, type DocumentElement, type Table, type TableRow, type TableCell, type Paragraph, type ParserConfig, OutputFormat, } from '@mlx-node/core';
38
+ export { documentToXlsx, saveToXlsx } from '@mlx-node/core';
39
+ export { Qwen3Tokenizer as Tokenizer } from '@mlx-node/lm';
40
+ export { MxArray, type DType } from '@mlx-node/core';
41
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAKH,OAAO,EAAE,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAGlE,OAAO,EAAE,cAAc,EAAE,KAAK,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAGpE,OAAO,EAAE,YAAY,EAAE,KAAK,OAAO,EAAE,YAAY,EAAE,KAAK,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAG1F,OAAO,EACL,mBAAmB,EACnB,KAAK,iBAAiB,EACtB,KAAK,oBAAoB,EACzB,cAAc,EACd,KAAK,YAAY,GAClB,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EACL,mBAAmB,EACnB,KAAK,iBAAiB,EACtB,KAAK,cAAc,EACnB,KAAK,iBAAiB,EACtB,KAAK,kBAAkB,EACvB,KAAK,QAAQ,GACd,MAAM,4BAA4B,CAAC;AAGpC,YAAY,EACV,YAAY,EACZ,UAAU,EACV,WAAW,EACX,aAAa,EACb,cAAc,EACd,YAAY,GACb,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EAAE,oBAAoB,EAAE,KAAK,iBAAiB,EAAE,MAAM,kCAAkC,CAAC;AAGhG,OAAO,EAAE,aAAa,EAAE,KAAK,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAGnE,OAAO,EACL,mBAAmB,EACnB,cAAc,EACd,cAAc,EACd,KAAK,cAAc,EACnB,KAAK,eAAe,EACpB,KAAK,KAAK,EACV,KAAK,QAAQ,EACb,KAAK,SAAS,EACd,KAAK,SAAS,EACd,KAAK,YAAY,EACjB,YAAY,GACb,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAG5D,OAAO,EAAE,cAAc,IAAI,SAAS,EAAE,MAAM,cAAc,CAAC;AAC3D,OAAO,EAAE,OAAO,EAAE,KAAK,KAAK,EAAE,MAAM,gBAAgB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,50 @@
1
+ /**
2
+ * @mlx-node/vlm - Vision Language Model support for MLX-Node
3
+ *
4
+ * This package provides VLM capabilities including:
5
+ * - VLModel for OCR and document understanding tasks
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { VLModel } from '@mlx-node/vlm';
10
+ *
11
+ * // Load a model
12
+ * const model = await VLModel.load('./models/paddleocr-vl');
13
+ *
14
+ * // Chat with images
15
+ * const imageBuffer = readFileSync('./photo.jpg');
16
+ * const result = await model.chat(
17
+ * [{ role: 'user', content: 'What is in this image?' }],
18
+ * { images: [imageBuffer] }
19
+ * );
20
+ * console.log(result.text);
21
+ *
22
+ * // Simple OCR
23
+ * const text = await model.ocr(readFileSync('./document.jpg'));
24
+ *
25
+ * // Batch OCR (multiple images)
26
+ * const texts = await model.ocrBatch([readFileSync('page1.jpg'), readFileSync('page2.jpg')]);
27
+ * ```
28
+ */
29
+ // ============== PUBLIC API ==============
30
+ // Main model class and factory functions (exposed directly from Rust)
31
+ export { VLModel, createPaddleocrVlConfig } from '@mlx-node/core';
32
+ // Document layout analysis model
33
+ export { DocLayoutModel } from '@mlx-node/core';
34
+ // Text detection and recognition models (PP-OCRv5)
35
+ export { TextDetModel, TextRecModel } from '@mlx-node/core';
36
+ // Document preprocessing models
37
+ export { DocOrientationModel, DocUnwarpModel, } from '@mlx-node/core';
38
+ // Document understanding pipeline (PP-StructureV3)
39
+ export { StructureV3Pipeline, } from './pipeline/structure-v3.js';
40
+ // Model-specific configs
41
+ export { PADDLEOCR_VL_CONFIGS } from './models/paddleocr-vl-configs.js';
42
+ // Chat result type
43
+ export { VlmChatResult } from '@mlx-node/core';
44
+ // Output parsing and formatting (Rust implementation)
45
+ export { parsePaddleResponse, parseVlmOutput, formatDocument, OutputFormat, } from '@mlx-node/core';
46
+ // XLSX export (Rust implementation)
47
+ export { documentToXlsx, saveToXlsx } from '@mlx-node/core';
48
+ // Re-export shared utilities
49
+ export { Qwen3Tokenizer as Tokenizer } from '@mlx-node/lm';
50
+ export { MxArray } from '@mlx-node/core';
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Model exports
3
+ */
4
+ export { PADDLEOCR_VL_CONFIGS, type PaddleOCRVLConfig } from './paddleocr-vl-configs.js';
5
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/models/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,EAAE,oBAAoB,EAAE,KAAK,iBAAiB,EAAE,MAAM,2BAA2B,CAAC"}
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Model exports
3
+ */
4
+ export { PADDLEOCR_VL_CONFIGS } from './paddleocr-vl-configs.js';
@@ -0,0 +1,53 @@
1
+ /**
2
+ * PaddleOCR-VL Model Configurations
3
+ *
4
+ * Based on the PaddleOCR-VL-1.5 model architecture.
5
+ */
6
+ export interface VisionConfig {
7
+ modelType: 'paddleocr_vl';
8
+ hiddenSize: number;
9
+ intermediateSize: number;
10
+ numHiddenLayers: number;
11
+ numAttentionHeads: number;
12
+ numChannels: number;
13
+ imageSize: number;
14
+ patchSize: number;
15
+ hiddenAct: string;
16
+ layerNormEps: number;
17
+ attentionDropout: number;
18
+ spatialMergeSize: number;
19
+ }
20
+ export interface TextConfig {
21
+ modelType: 'paddleocr_vl';
22
+ hiddenSize: number;
23
+ numHiddenLayers: number;
24
+ intermediateSize: number;
25
+ numAttentionHeads: number;
26
+ rmsNormEps: number;
27
+ vocabSize: number;
28
+ numKeyValueHeads: number;
29
+ maxPositionEmbeddings: number;
30
+ ropeTheta: number;
31
+ ropeTraditional: boolean;
32
+ useBias: boolean;
33
+ headDim: number;
34
+ mropeSection: [number, number, number];
35
+ }
36
+ export interface PaddleOCRVLConfig {
37
+ visionConfig: VisionConfig;
38
+ textConfig: TextConfig;
39
+ modelType: 'paddleocr_vl';
40
+ ignoreIndex: number;
41
+ imageTokenId: number;
42
+ videoTokenId: number;
43
+ visionStartTokenId: number;
44
+ visionEndTokenId: number;
45
+ eosTokenId: number;
46
+ }
47
+ /**
48
+ * Available PaddleOCR-VL configurations
49
+ */
50
+ export declare const PADDLEOCR_VL_CONFIGS: {
51
+ readonly 'paddleocr-vl-1.5': PaddleOCRVLConfig;
52
+ };
53
+ //# sourceMappingURL=paddleocr-vl-configs.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"paddleocr-vl-configs.d.ts","sourceRoot":"","sources":["../../src/models/paddleocr-vl-configs.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,cAAc,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,CAAC;IACxB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,gBAAgB,EAAE,MAAM,CAAC;IACzB,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,cAAc,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,CAAC;IACxB,gBAAgB,EAAE,MAAM,CAAC;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,qBAAqB,EAAE,MAAM,CAAC;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe,EAAE,OAAO,CAAC;IACzB,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;CACxC;AAED,MAAM,WAAW,iBAAiB;IAChC,YAAY,EAAE,YAAY,CAAC;IAC3B,UAAU,EAAE,UAAU,CAAC;IACvB,SAAS,EAAE,cAAc,CAAC;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,gBAAgB,EAAE,MAAM,CAAC;IACzB,UAAU,EAAE,MAAM,CAAC;CACpB;AA6CD;;GAEG;AACH,eAAO,MAAM,oBAAoB;;CAEvB,CAAC"}
@@ -0,0 +1,53 @@
1
+ /**
2
+ * PaddleOCR-VL Model Configurations
3
+ *
4
+ * Based on the PaddleOCR-VL-1.5 model architecture.
5
+ */
6
+ /**
7
+ * PaddleOCR-VL-1.5 default configuration
8
+ */
9
+ const PADDLEOCR_VL_1_5_CONFIG = {
10
+ modelType: 'paddleocr_vl',
11
+ ignoreIndex: -100,
12
+ imageTokenId: 100295,
13
+ videoTokenId: 100296,
14
+ visionStartTokenId: 101305,
15
+ visionEndTokenId: 101306,
16
+ eosTokenId: 2,
17
+ visionConfig: {
18
+ modelType: 'paddleocr_vl',
19
+ hiddenSize: 1152,
20
+ intermediateSize: 4304,
21
+ numHiddenLayers: 27,
22
+ numAttentionHeads: 16,
23
+ numChannels: 3,
24
+ imageSize: 384,
25
+ patchSize: 14,
26
+ hiddenAct: 'gelu_pytorch_tanh',
27
+ layerNormEps: 1e-6,
28
+ attentionDropout: 0.0,
29
+ spatialMergeSize: 2,
30
+ },
31
+ textConfig: {
32
+ modelType: 'paddleocr_vl',
33
+ hiddenSize: 1024,
34
+ numHiddenLayers: 18,
35
+ intermediateSize: 3072,
36
+ numAttentionHeads: 16,
37
+ rmsNormEps: 1e-5,
38
+ vocabSize: 103424,
39
+ numKeyValueHeads: 2,
40
+ maxPositionEmbeddings: 131072,
41
+ ropeTheta: 500000.0,
42
+ ropeTraditional: false,
43
+ useBias: false,
44
+ headDim: 128,
45
+ mropeSection: [16, 24, 24],
46
+ },
47
+ };
48
+ /**
49
+ * Available PaddleOCR-VL configurations
50
+ */
51
+ export const PADDLEOCR_VL_CONFIGS = {
52
+ 'paddleocr-vl-1.5': PADDLEOCR_VL_1_5_CONFIG,
53
+ };
@@ -0,0 +1,130 @@
1
+ /**
2
+ * PP-StructureV3 Document Understanding Pipeline
3
+ *
4
+ * Combines PP-DocLayoutV3 (layout detection) with PP-OCRv5 (text detection + recognition)
5
+ * for fast, accurate document understanding without a VLM.
6
+ *
7
+ * Pipeline:
8
+ * 1. DocLayoutModel detects layout elements (titles, text, tables, figures...)
9
+ * 2. For text/title/list elements: TextDetModel detects text lines → TextRecModel recognizes text
10
+ * 3. For table/formula/chart: VLM fallback (optional)
11
+ * 4. Results assembled into structured markdown in reading order
12
+ *
13
+ * @example
14
+ * ```typescript
15
+ * import { StructureV3Pipeline } from '@mlx-node/vlm';
16
+ *
17
+ * const pipeline = StructureV3Pipeline.load({
18
+ * layoutModelPath: './models/PP-DocLayoutV3',
19
+ * textDetModelPath: './models/PP-OCRv5_server_det',
20
+ * textRecModelPath: './models/PP-OCRv5_server_rec',
21
+ * dictPath: './models/PP-OCRv5_server_rec/ppocr_keys_v1.txt',
22
+ * });
23
+ *
24
+ * const result = pipeline.analyze('./document.png');
25
+ * console.log(result.markdown);
26
+ * ```
27
+ */
28
+ /** Configuration for loading the StructureV3 pipeline. */
29
+ export interface StructureV3Config {
30
+ /** Path to PP-DocLayoutV3 model directory */
31
+ layoutModelPath: string;
32
+ /** Path to PP-OCRv5 text detection model directory */
33
+ textDetModelPath: string;
34
+ /** Path to PP-OCRv5 text recognition model directory */
35
+ textRecModelPath: string;
36
+ /** Path to character dictionary file (e.g., ppocr_keys_v1.txt) */
37
+ dictPath: string;
38
+ /** Path to doc orientation classification model directory (optional) */
39
+ docOrientationModelPath?: string;
40
+ /** Path to doc unwarping model directory (optional) */
41
+ docUnwarpModelPath?: string;
42
+ }
43
+ /** Options for document analysis. */
44
+ export interface AnalyzeOptions {
45
+ /** Layout detection confidence threshold (default: 0.5) */
46
+ layoutThreshold?: number;
47
+ /** Text detection confidence threshold (default: 0.3) */
48
+ textDetThreshold?: number;
49
+ /** Whether to include element-level details in output (default: false) */
50
+ includeDetails?: boolean;
51
+ /** Whether to run document orientation classification (default: true if model loaded) */
52
+ useDocOrientationClassify?: boolean;
53
+ /** Whether to run document unwarping (default: true if model loaded) */
54
+ useDocUnwarping?: boolean;
55
+ }
56
+ /** A recognized text line within a layout element. */
57
+ export interface TextLine {
58
+ /** Bounding box [x1, y1, x2, y2] relative to the element crop */
59
+ bbox: number[];
60
+ /** Recognized text */
61
+ text: string;
62
+ /** Recognition confidence */
63
+ score: number;
64
+ }
65
+ /** A structured document element with recognized content. */
66
+ export interface StructuredElement {
67
+ /** Element type from layout detection */
68
+ label: string;
69
+ /** Detection confidence */
70
+ score: number;
71
+ /** Bounding box [x1, y1, x2, y2] in original image coordinates */
72
+ bbox: number[];
73
+ /** Reading order index */
74
+ order: number;
75
+ /** Recognized text content */
76
+ text: string;
77
+ /** Individual text lines (if includeDetails is true) */
78
+ lines?: TextLine[];
79
+ }
80
+ /** Result of document analysis. */
81
+ export interface StructuredDocument {
82
+ /** Structured elements in reading order */
83
+ elements: StructuredElement[];
84
+ /** Assembled markdown output */
85
+ markdown: string;
86
+ }
87
+ /**
88
+ * PP-StructureV3 document understanding pipeline.
89
+ *
90
+ * Uses dedicated OCR models (TextDet + TextRec) instead of a VLM,
91
+ * providing ~4-5x faster text extraction with ~6x lower memory usage.
92
+ */
93
+ export declare class StructureV3Pipeline {
94
+ private layout;
95
+ private textDet;
96
+ private textRec;
97
+ private docOrientation;
98
+ private docUnwarp;
99
+ private constructor();
100
+ /**
101
+ * Load all models and create the pipeline.
102
+ */
103
+ static load(config: StructureV3Config): StructureV3Pipeline;
104
+ /**
105
+ * Analyze a document image and extract structured content.
106
+ *
107
+ * @param imageData - Buffer with encoded image bytes, or a file path string
108
+ * @param options - Analysis options
109
+ * @returns Structured document with elements and markdown
110
+ */
111
+ analyze(imageData: Buffer | string, options?: AnalyzeOptions): Promise<StructuredDocument>;
112
+ /**
113
+ * Run text detection + recognition on a single image (no layout detection).
114
+ *
115
+ * Useful for processing pre-cropped text regions.
116
+ */
117
+ ocrImage(imageData: Buffer, textDetThreshold?: number): Promise<TextLine[]>;
118
+ /**
119
+ * Detect text lines and recognize text in a cropped region.
120
+ *
121
+ * For each detected text line bounding box, sub-crops that line from the
122
+ * crop image and passes the individual line image to text recognition.
123
+ */
124
+ private ocrRegion;
125
+ /**
126
+ * Crop a layout element from the source image and return PNG bytes.
127
+ */
128
+ private cropElement;
129
+ }
130
+ //# sourceMappingURL=structure-v3.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"structure-v3.d.ts","sourceRoot":"","sources":["../../src/pipeline/structure-v3.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAgBH,0DAA0D;AAC1D,MAAM,WAAW,iBAAiB;IAChC,6CAA6C;IAC7C,eAAe,EAAE,MAAM,CAAC;IACxB,sDAAsD;IACtD,gBAAgB,EAAE,MAAM,CAAC;IACzB,wDAAwD;IACxD,gBAAgB,EAAE,MAAM,CAAC;IACzB,kEAAkE;IAClE,QAAQ,EAAE,MAAM,CAAC;IACjB,wEAAwE;IACxE,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,uDAAuD;IACvD,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED,qCAAqC;AACrC,MAAM,WAAW,cAAc;IAC7B,2DAA2D;IAC3D,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,yDAAyD;IACzD,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,0EAA0E;IAC1E,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,yFAAyF;IACzF,yBAAyB,CAAC,EAAE,OAAO,CAAC;IACpC,wEAAwE;IACxE,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B;AAED,sDAAsD;AACtD,MAAM,WAAW,QAAQ;IACvB,iEAAiE;IACjE,IAAI,EAAE,MAAM,EAAE,CAAC;IACf,sBAAsB;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,6BAA6B;IAC7B,KAAK,EAAE,MAAM,CAAC;CACf;AAED,6DAA6D;AAC7D,MAAM,WAAW,iBAAiB;IAChC,yCAAyC;IACzC,KAAK,EAAE,MAAM,CAAC;IACd,2BAA2B;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,kEAAkE;IAClE,IAAI,EAAE,MAAM,EAAE,CAAC;IACf,0BAA0B;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,wDAAwD;IACxD,KAAK,CAAC,EAAE,QAAQ,EAAE,CAAC;CACpB;AAED,mCAAmC;AACnC,MAAM,WAAW,kBAAkB;IACjC,2CAA2C;IAC3C,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;CAClB;AAqCD;;;;;GAKG;AACH,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,MAAM,CAAiB;IAC/B,OAAO,CAAC,OAAO,CAAe;IAC9B,OAAO,CAAC,OAAO,CAAe;IAC9B,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,SAAS,CAAwB;IAEzC,OAAO;IAcP;;OAEG;IACH,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,iBAAiB,GAAG,mBAAmB;IAa3D;;;;;;OAMG;IACG,OAAO,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,EAAE,OAAO,GAAE,cAAmB,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAgGpG;;;;OAIG;IACG,QAAQ,CAAC,SAAS,EAAE,MAAM,EAAE,gBAAgB,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;IAIjF;;;;;OAKG;YACW,SAAS;IAwDvB;;OAEG;YACW,WAAW;CAW1B"}
@@ -0,0 +1,314 @@
1
+ /**
2
+ * PP-StructureV3 Document Understanding Pipeline
3
+ *
4
+ * Combines PP-DocLayoutV3 (layout detection) with PP-OCRv5 (text detection + recognition)
5
+ * for fast, accurate document understanding without a VLM.
6
+ *
7
+ * Pipeline:
8
+ * 1. DocLayoutModel detects layout elements (titles, text, tables, figures...)
9
+ * 2. For text/title/list elements: TextDetModel detects text lines → TextRecModel recognizes text
10
+ * 3. For table/formula/chart: VLM fallback (optional)
11
+ * 4. Results assembled into structured markdown in reading order
12
+ *
13
+ * @example
14
+ * ```typescript
15
+ * import { StructureV3Pipeline } from '@mlx-node/vlm';
16
+ *
17
+ * const pipeline = StructureV3Pipeline.load({
18
+ * layoutModelPath: './models/PP-DocLayoutV3',
19
+ * textDetModelPath: './models/PP-OCRv5_server_det',
20
+ * textRecModelPath: './models/PP-OCRv5_server_rec',
21
+ * dictPath: './models/PP-OCRv5_server_rec/ppocr_keys_v1.txt',
22
+ * });
23
+ *
24
+ * const result = await pipeline.analyze('./document.png');
25
+ * console.log(result.markdown);
26
+ * ```
27
+ */
28
+ import { readFileSync } from 'node:fs';
29
+ import { DocLayoutModel, TextDetModel, TextRecModel, DocOrientationModel, DocUnwarpModel, } from '@mlx-node/core';
30
+ // ============================================================================
31
+ // Element type sets
32
+ // ============================================================================
33
/** Elements that contain text and should be processed with OCR */
const TEXT_LABELS = new Set([
    'title', 'doc_title', 'paragraph_title', 'text', 'abstract', 'list',
    'table_caption', 'table_footnote', 'figure_caption', 'chart_caption',
    'formula_caption', 'code_txt', 'header', 'footer', 'footnote',
    'margin_note', 'reference', 'content', 'index', 'handwriting',
]);
/** Elements that are non-content (skipped) */
const SKIP_LABELS = new Set(['abandon']);
58
+ // ============================================================================
59
+ // Pipeline
60
+ // ============================================================================
61
/**
 * PP-StructureV3 document understanding pipeline.
 *
 * Uses dedicated OCR models (TextDet + TextRec) instead of a VLM,
 * providing ~4-5x faster text extraction with ~6x lower memory usage.
 */
export class StructureV3Pipeline {
    layout;
    textDet;
    textRec;
    docOrientation;
    docUnwarp;
    constructor(layout, textDet, textRec, docOrientation, docUnwarp) {
        this.layout = layout;
        this.textDet = textDet;
        this.textRec = textRec;
        this.docOrientation = docOrientation;
        this.docUnwarp = docUnwarp;
    }
    /**
     * Load all models and create the pipeline.
     *
     * Orientation and unwarp models are optional: when their paths are
     * omitted, the corresponding preprocessing step is skipped.
     */
    static load(config) {
        const layout = DocLayoutModel.load(config.layoutModelPath);
        const textDet = TextDetModel.load(config.textDetModelPath);
        const textRec = TextRecModel.load(config.textRecModelPath, config.dictPath);
        const docOrientation = config.docOrientationModelPath
            ? DocOrientationModel.load(config.docOrientationModelPath)
            : null;
        const docUnwarp = config.docUnwarpModelPath ? DocUnwarpModel.load(config.docUnwarpModelPath) : null;
        return new StructureV3Pipeline(layout, textDet, textRec, docOrientation, docUnwarp);
    }
    /**
     * Analyze a document image and extract structured content.
     *
     * @param imageData - Buffer with encoded image bytes, or a file path string
     * @param options - Analysis options
     * @returns Structured document with elements and markdown
     */
    async analyze(imageData, options = {}) {
        const { layoutThreshold = 0.5, textDetThreshold, includeDetails = false } = options;
        let imageBuffer = typeof imageData === 'string' ? readFileSync(imageData) : imageData;
        // Step 0a: Document orientation correction (re-encode only when rotated)
        if (this.docOrientation && (options.useDocOrientationClassify ?? true)) {
            const rotateResult = this.docOrientation.classifyAndRotate(imageBuffer);
            if (rotateResult.angle !== 0) {
                imageBuffer = Buffer.from(rotateResult.image);
            }
        }
        // Step 0b: Document unwarping
        if (this.docUnwarp && (options.useDocUnwarping ?? true)) {
            const unwarpResult = this.docUnwarp.unwarp(imageBuffer);
            imageBuffer = Buffer.from(unwarpResult.image);
        }
        // Step 1: Layout detection (on preprocessed image)
        const layoutElements = this.layout.detect(imageBuffer, layoutThreshold);
        if (layoutElements.length === 0) {
            return { elements: [], markdown: '' };
        }
        // Step 2: Process each element
        const elements = [];
        for (const el of layoutElements) {
            const label = el.labelName;
            if (SKIP_LABELS.has(label)) {
                continue;
            }
            const base = { label, score: el.score, bbox: el.bbox, order: el.order };
            if (TEXT_LABELS.has(label) || label === 'table') {
                // OCR path: detect text lines then recognize. Tables currently
                // share the plain-text path (no cell-level structure yet), so the
                // two branches are merged instead of duplicated.
                const cropBuffer = await this.cropElement(imageBuffer, el);
                const textLines = await this.ocrRegion(cropBuffer, textDetThreshold);
                elements.push({
                    ...base,
                    text: textLines.map((l) => l.text).join('\n'),
                    lines: includeDetails ? textLines : undefined,
                });
            }
            else if (label === 'isolate_formula') {
                // Formula: basic OCR joined on spaces (no LaTeX recognition yet)
                const cropBuffer = await this.cropElement(imageBuffer, el);
                const textLines = await this.ocrRegion(cropBuffer, textDetThreshold);
                elements.push({ ...base, text: textLines.map((l) => l.text).join(' ') });
            }
            else {
                // figure, chart, seal, etc. - placeholder
                elements.push({ ...base, text: '' });
            }
        }
        // Step 3: Assemble markdown in reading order
        const markdown = assembleMarkdown(elements);
        return { elements, markdown };
    }
    /**
     * Run text detection + recognition on a single image (no layout detection).
     *
     * Useful for processing pre-cropped text regions.
     */
    async ocrImage(imageData, textDetThreshold) {
        return this.ocrRegion(imageData, textDetThreshold);
    }
    /**
     * Detect text lines and recognize text in a cropped region.
     *
     * For each detected text line bounding box, sub-crops that line from the
     * crop image and passes the individual line image to text recognition.
     */
    async ocrRegion(imageData, textDetThreshold) {
        // Detect text lines within the crop
        const textBoxes = this.textDet.detect(imageData, textDetThreshold);
        if (textBoxes.length === 0) {
            // Fall back to recognizing the entire crop as one text line
            const result = this.textRec.recognize(imageData);
            if (result.text.trim()) {
                return [{ bbox: [0, 0, 0, 0], text: result.text, score: result.score }];
            }
            return [];
        }
        // Sort text boxes into reading order: top-to-bottom with a 10px
        // tolerance, then left-to-right for boxes on the same visual row.
        const sorted = [...textBoxes].sort((a, b) => {
            const yDiff = a.bbox[1] - b.bbox[1];
            return Math.abs(yDiff) > 10 ? yDiff : a.bbox[0] - b.bbox[0];
        });
        // Single detected line: recognize the full crop directly (no sub-crop needed)
        if (sorted.length === 1) {
            const result = this.textRec.recognize(imageData);
            return [{ bbox: sorted[0].bbox, text: result.text, score: result.score }];
        }
        // Multiple detected lines: sub-crop each line from the crop image.
        // Crops are independent, so run them in parallel; Promise.all preserves
        // input order, keeping results aligned with `sorted`.
        const lineBuffers = await Promise.all(sorted.map((box) => this.cropBbox(imageData, box.bbox)));
        const results = this.textRec.recognizeBatch(lineBuffers);
        return sorted.map((box, i) => ({
            bbox: box.bbox,
            text: results[i]?.text ?? '',
            score: results[i]?.score ?? 0,
        }));
    }
    /**
     * Crop a layout element from the source image and return PNG bytes.
     */
    async cropElement(imageBuffer, el) {
        return this.cropBbox(imageBuffer, el.bbox);
    }
    /**
     * Crop an [x1, y1, x2, y2] bbox out of an encoded image and return PNG bytes.
     *
     * Coordinates are clamped to non-negative integers with a minimum 1px
     * extent. Shared by element crops and per-line sub-crops.
     */
    async cropBbox(imageBuffer, bbox) {
        const [x1, y1, x2, y2] = bbox;
        const x = Math.max(0, Math.round(x1));
        const y = Math.max(0, Math.round(y1));
        const w = Math.max(1, Math.round(x2 - x1));
        const h = Math.max(1, Math.round(y2 - y1));
        // Dynamic import keeps @napi-rs/image lazy; module is cached after first use.
        const { Transformer } = await import('@napi-rs/image');
        const cropped = await new Transformer(imageBuffer).crop(x, y, w, h).png();
        return Buffer.from(cropped);
    }
}
259
+ // ============================================================================
260
+ // Markdown assembly
261
+ // ============================================================================
262
/** Format a single element as markdown. Returns '' when there is nothing to render. */
function formatElement(label, text, order) {
    const trimmed = text.trim();
    // Figures and charts always render a placeholder, even with no OCR text
    // (analyze() pushes them with text: ''). Handle them before the empty-text
    // guard: previously this case sat after `if (!trimmed) return ''`, which
    // made the `[figure]` / `[chart]` fallback branch unreachable.
    if (label === 'figure' || label === 'chart') {
        return trimmed ? `[${label}: ${trimmed}]\n` : `[${label}]\n`;
    }
    if (!trimmed)
        return '';
    switch (label) {
        case 'doc_title':
            return `# ${trimmed}\n`;
        case 'title':
            return `## ${trimmed}\n`;
        case 'paragraph_title':
            return `### ${trimmed}\n`;
        case 'abstract':
            return `> ${trimmed}\n`;
        case 'table':
            return `${trimmed}\n`;
        case 'table_caption':
        case 'figure_caption':
        case 'chart_caption':
        case 'formula_caption':
            return `*${trimmed}*\n`;
        case 'isolate_formula':
            return `$$\n${trimmed}\n$$\n`;
        case 'code_txt':
            return `\`\`\`\n${trimmed}\n\`\`\`\n`;
        case 'header':
        case 'footer':
            return `<!-- ${label}: ${trimmed} -->\n`;
        case 'footnote':
        case 'table_footnote':
            // Footnote id derives from reading order to stay unique per page
            return `[^note-${order}]: ${trimmed}\n`;
        case 'list':
            return `${trimmed}\n`;
        case 'seal':
            return `[seal: ${trimmed}]\n`;
        default:
            return `${trimmed}\n`;
    }
}
304
/**
 * Assemble structured elements into markdown.
 *
 * Formats each element in order and joins the non-empty results with a
 * blank line between them; elements that format to '' are dropped.
 */
function assembleMarkdown(elements) {
    return elements
        .map((el) => formatElement(el.label, el.text, el.order))
        .filter((part) => part !== '')
        .join('\n');
}
package/package.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "name": "@mlx-node/vlm",
3
+ "version": "0.0.0",
4
+ "homepage": "https://github.com/mlx-node/mlx-node",
5
+ "bugs": {
6
+ "url": "https://github.com/mlx-node/mlx-node/issues"
7
+ },
8
+ "license": "MIT",
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "https://github.com/mlx-node/mlx-node.git",
12
+ "directory": "packages/vlm"
13
+ },
14
+ "files": [
15
+ "dist"
16
+ ],
17
+ "type": "module",
18
+ "main": "./dist/index.js",
19
+ "types": "./dist/index.d.ts",
20
+ "exports": {
21
+ ".": {
22
+ "types": "./dist/index.d.ts",
23
+ "import": "./dist/index.js"
24
+ }
25
+ },
26
+ "scripts": {
27
+ "build": "tsc -b",
28
+ "test": "vite test run"
29
+ },
30
+ "dependencies": {
31
+ "@mlx-node/core": "workspace:*",
32
+ "@mlx-node/lm": "workspace:*",
33
+ "@napi-rs/image": "^1.12.0"
34
+ }
35
+ }