macos-vision 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +168 -103
- package/bin/pdf-helper +0 -0
- package/bin/vision-helper +0 -0
- package/dist/cli.js +131 -68
- package/dist/index.d.ts +2 -0
- package/dist/index.js +5 -3
- package/dist/markdown/chunker.d.ts +11 -0
- package/dist/markdown/chunker.js +39 -0
- package/dist/markdown/index.d.ts +61 -0
- package/dist/markdown/index.js +92 -0
- package/dist/markdown/ollama.d.ts +21 -0
- package/dist/markdown/ollama.js +50 -0
- package/dist/markdown/prompt.d.ts +35 -0
- package/dist/markdown/prompt.js +82 -0
- package/package.json +30 -5
- package/src/native/pdf-helper.swift +122 -0
- package/src/native/vision-helper.swift +241 -0
- package/.husky/commit-msg +0 -2
- package/.husky/pre-commit +0 -3
- package/.prettierignore +0 -4
- package/.prettierrc.json +0 -7
- package/.release-it.json +0 -20
- package/CHANGELOG.md +0 -44
- package/commitlint.config.js +0 -1
- package/debug.js +0 -37
- package/eslint.config.js +0 -21
package/README.md CHANGED

@@ -1,6 +1,6 @@
 # macos-vision

-> Apple Vision for Node.js — native, fast, offline
+> Apple Vision for Node.js — native, fast, offline. Now with an optional Ollama-driven Markdown pipeline.

 Uses macOS's built-in [Vision framework](https://developer.apple.com/documentation/vision) via a compiled Swift binary. Works completely offline. No cloud services, no API keys, no Python, zero runtime dependencies.

@@ -8,11 +8,8 @@ Uses macOS's built-in [Vision framework](https://developer.apple.com/documentati

 - macOS 12+
 - Node.js 18+
-- Xcode Command Line Tools
-
-```bash
-xcode-select --install
-```
+- Xcode Command Line Tools (`xcode-select --install`)
+- [Ollama](https://ollama.com) running locally — only if you use the Markdown pipeline

 ## Installation

@@ -20,18 +17,18 @@ xcode-select --install
 npm install macos-vision
 ```

-The native Swift
-
-## What this is (and isn't)
-
-`macos-vision` gives you **raw Apple Vision results** — text, coordinates, bounding boxes, labels.
+The native Swift binaries (`vision-helper`, `pdf-helper`) are compiled automatically on install.

-
-- Convert PDFs or images to Markdown
-- Understand document structure (headings, tables, paragraphs)
-- Chain multiple detections into a final report
+## What you get

-
+| Capability | Engine | Network |
+|---|---|---|
+| OCR (text + bounding boxes) | Apple Vision | offline |
+| Face / barcode / rectangle / document detection | Apple Vision | offline |
+| Image classification | Apple Vision | offline |
+| Layout inference (lines, paragraphs, reading order) | heuristic in TypeScript | offline |
+| PDF rasterization | PDFKit (`pdf-helper`) | offline |
+| **Image / PDF → Markdown** | Apple Vision OCR + local LLM via Ollama | local LLM call |

 ---

@@ -44,60 +41,71 @@ npx macos-vision photo.jpg
 # Structured OCR blocks with bounding boxes
 npx macos-vision --blocks photo.jpg

-#
+# Detections
 npx macos-vision --faces photo.jpg
-
-# Detect barcodes and QR codes
 npx macos-vision --barcodes photo.jpg
-
-# Detect rectangular shapes
 npx macos-vision --rectangles photo.jpg
-
-# Find document boundary
 npx macos-vision --document photo.jpg
-
-# Classify image content
 npx macos-vision --classify photo.jpg

 # Run all detections at once
 npx macos-vision --all photo.jpg
+
+# Image / PDF → Markdown via VisionScribe + Ollama
+npx macos-vision --markdown invoice.pdf -o notes.md
+npx macos-vision --markdown receipt.jpg --stdout
+npx macos-vision --markdown scan.png --model llama3.2
 ```

-Multiple flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg
+Multiple Vision flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg`. Structured results are printed as JSON to stdout.

-
+### CLI flags
+
+| Flag | Description |
+|---|---|
+| `--ocr` | Plain text OCR (default when no flag is given) |
+| `--blocks` | OCR with bounding boxes (JSON) |
+| `--faces` / `--barcodes` / `--rectangles` / `--document` / `--classify` | Vision detections (JSON) |
+| `--all` | Run every Vision detection at once |
+| `--markdown` | Convert image / PDF to Markdown via VisionScribe + Ollama |
+| `--model <name>` | Ollama model (default: `mistral-nemo`). Only used with `--markdown` |
+| `--ollama-url <url>` | Ollama base URL (default: `http://localhost:11434`). Only used with `--markdown` |
+| `-o`, `--output <path>` | Write Markdown to a file. Only used with `--markdown` |
+| `--stdout` | Print Markdown to stdout instead of a file. Only used with `--markdown` |
+| `--help` | Show usage |

 ---

-## API
+## API — Vision

 ```js
-import {
+import {
+  ocr,
+  detectFaces,
+  detectBarcodes,
+  detectRectangles,
+  detectDocument,
+  classify,
+  inferLayout,
+} from 'macos-vision';

 // OCR — plain text
-const text = await ocr('photo.jpg')
+const text = await ocr('photo.jpg');

 // OCR — structured blocks with bounding boxes
-const blocks = await ocr('photo.jpg', { format: 'blocks' })
-
-// Detect faces
-const faces = await detectFaces('photo.jpg')
-
-// Detect barcodes and QR codes
-const codes = await detectBarcodes('invoice.jpg')
+const blocks = await ocr('photo.jpg', { format: 'blocks' });

-// Detect
-const
-
-
-const doc = await detectDocument('photo.jpg') // DocumentBounds | null
+// Detect faces / barcodes / rectangles / document boundary
+const faces = await detectFaces('photo.jpg');
+const codes = await detectBarcodes('invoice.jpg');
+const rects = await detectRectangles('document.jpg');
+const doc = await detectDocument('photo.jpg'); // DocumentBounds | null

 // Classify image content
-const labels = await classify('photo.jpg')
+const labels = await classify('photo.jpg');

 // Layout inference — unified reading-order-sorted representation
-const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes })
-// layout is LayoutBlock[] — ready to feed into a Markdown renderer or LLM context
+const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes });
 ```

 ### Layout inference

@@ -134,116 +142,173 @@ for (const block of layout) {

 > **Note:** Layout inference is a heuristic layer. It does not understand multi-column layouts or rotated text. Treat it as structured input for downstream tools, not as ground truth.

-
+---

-
+## API — Markdown pipeline (VisionScribe)

-
+`VisionScribe` converts an image or PDF to Markdown by combining Apple Vision OCR with a local LLM (via Ollama). The LLM never sees the image — it only formats text that Vision already extracted. This keeps image processing local and reduces the risk of vision-model hallucinations, but Markdown reconstruction is still best-effort and depends on the local model and document complexity.

-
-|-----------|------|---------|-------------|
-| `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) |
-| `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
+### Prerequisites

-
-
-
-
-  text: string
-  x: number      // 0–1 from left
-  y: number      // 0–1 from top
-  width: number  // 0–1
-  height: number // 0–1
-}
+```bash
+brew install ollama
+ollama serve          # keep this running
+ollama pull mistral-nemo
 ```

-
+### Quick start

-
+```ts
+import { VisionScribe } from 'macos-vision';

-
+const scribe = new VisionScribe();
+const markdown = await scribe.toMarkdown('receipt.png');
+console.log(markdown);
+```
+
+For a narrower import surface that pulls in only the markdown sub-module:

 ```ts
-
-  x: number; y: number; width: number; height: number
-  confidence: number // 0–1
-}
+import { VisionScribe } from 'macos-vision/markdown';
 ```

-
+### How it works

-
+```
+Image / PDF
+     │
+     ▼
+Apple Vision OCR             ← macOS native text extraction
+     │  VisionBlock[] per page
+     ▼
+Per-page layout inference    ← each page processed independently (page-local coords)
+     │  paragraphId, lineId, y
+     ▼
+Chunker                      ← batches paragraphs to fit the LLM output window
+     │  ParagraphGroup[][]
+     ▼
+Ollama /api/chat             ← system prompt as role:"system", OCR text as role:"user"
+     │  temperature=0, top_p=1, num_predict=-1
+     ▼
+Markdown string              ← chunk results joined with blank lines
+```

-
+The LLM never sees the raw image; it only formats text that Apple Vision has already extracted. The system prompt asks the model to preserve the source text, avoid summarising, and avoid adding content. OCR text is wrapped in `<ocr_source>` tags so the model is less likely to treat document text as user instructions. Per-page processing keeps paragraph coordinates from different pages from being mixed.

-
-interface Barcode {
-  type: string  // e.g. 'org.iso.QRCode', 'org.gs1.EAN-13'
-  value: string // decoded content
-  x: number; y: number; width: number; height: number
-}
-```
+### `new VisionScribe(options?)`

-
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `model` | `string` | `'mistral-nemo'` | Ollama model name |
+| `ollamaUrl` | `string` | `'http://localhost:11434'` | Base URL of the Ollama server |
+| `skipPing` | `boolean` | `false` | Skip per-call Ollama health check (useful in batch loops) |
+| `chunkSizeTokens` | `number` | `1800` | Max estimated output tokens per LLM chunk. Lower = more chunks (safer for small models); higher = fewer calls but risks hitting model output limits |
+
+### `scribe.toMarkdown(imagePath)`

-
+- Accepts PNG, JPEG, HEIC, HEIF, TIFF, GIF, BMP, WebP and **PDF**
+- Returns an empty string `''` if no text is detected
+- Throws `OllamaUnavailableError` if the Ollama server is not reachable (unless `skipPing: true`)

-
+### Batch processing

 ```ts
-
-
-
-
+import { VisionScribe, OllamaUnavailableError } from 'macos-vision';
+
+const scribe = new VisionScribe({ skipPing: true });
+
+for (const file of files) {
+  try {
+    const md = await scribe.toMarkdown(file);
+    // …
+  } catch (e) {
+    if (e instanceof OllamaUnavailableError) {
+      console.error(e.message);
+      break;
+    }
+    throw e;
+  }
 }
 ```

+### Known limitations
+
+- **Local model fidelity**: small models (`mistral-nemo`, `gemma`) may occasionally summarise or paraphrase long, dense documents. Larger models (`llama3.1:70b`, `qwen2.5:32b`) produce significantly better fidelity.
+- **Tables**: multi-column table layouts are partially supported. OCR reads cells in reading order but the LLM may not always reconstruct correct Markdown table syntax.
+- **Images / charts**: non-textual content (photos, diagrams, charts) is ignored — only text blocks extracted by Apple Vision are processed.
+- **Markdown fidelity**: the prompt strongly asks for faithful reconstruction, but LLM output is not a cryptographic or deterministic guarantee. Review important legal, financial, or compliance documents before relying on the generated Markdown.
+
 ---

-
+## Migrating from `macos-vision-md`

-
+The standalone [`macos-vision-md`](https://github.com/woladi/macos-vision-md) package has been merged into `macos-vision` as of v2.0.0. The old package will keep working as a thin re-export shim, but new projects should depend on `macos-vision` directly.

-```
-
-
-  bottomLeft: [number, number]; bottomRight: [number, number]
-  confidence: number
-}
+```diff
+- import { VisionScribe } from 'macos-vision-md';
++ import { VisionScribe } from 'macos-vision';
 ```

+```diff
+- macos-vision-md invoice.pdf -o notes.md
++ macos-vision --markdown invoice.pdf -o notes.md
+```
+
+The `VisionScribe` API, the system prompt, and the chunking strategy are unchanged. `OllamaUnavailableError`, `VisionScribeOptions`, and `ParagraphGroup` are now exported from `macos-vision`.
+
 ---

-
+## API reference — types
+
+### `ocr(imagePath, options?)`

-
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) or PDF |
+| `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
+
+Returns `Promise<string>` or `Promise<VisionBlock[]>`.

 ```ts
-interface
-
-
+interface VisionBlock {
+  text: string
+  x: number      // 0–1 from left
+  y: number      // 0–1 from top
+  width: number  // 0–1
+  height: number // 0–1
+  confidence: number
+  page?: number  // 0-based, only for PDFs
 }
 ```

+### `detectFaces(imagePath)` / `detectBarcodes(imagePath)` / `detectRectangles(imagePath)` / `detectDocument(imagePath)` / `classify(imagePath)`
+
+See `src/index.ts` for full type declarations.
+
 ---

 ## Why macos-vision?

 | | macos-vision | Tesseract.js | Cloud APIs |
 |---|---|---|---|
-| Offline | ✅ | ✅ | ❌ |
+| Offline OCR | ✅ | ✅ | ❌ |
+| Offline image → Markdown | ✅ (with local Ollama) | ❌ | ❌ |
 | No API key | ✅ | ✅ | ❌ |
 | Native speed | ✅ | ❌ | — |
 | Zero runtime deps | ✅ | ❌ | ❌ |
 | OCR with bounding boxes | ✅ | ✅ | ✅ |
-| Face detection | ✅ | ❌ | ✅ |
-| Barcode / QR | ✅ | ❌ | ✅ |
-| Document detection | ✅ | ❌ | ✅ |
+| Face / barcode / document detection | ✅ | ❌ | ✅ |
 | Image classification | ✅ | ❌ | ✅ |
 | macOS only | ✅ | ❌ | ❌ |

 Apple Vision is the same engine used by macOS Spotlight, Live Text, and Shortcuts — highly optimized and accurate.

+### OCR evaluation notes
+
+In internal tests on anonymized scanned contracts, forms, declarations, and UI screenshots, Apple Vision OCR produced fewer OCR artifacts than Tesseract in most cases. The strongest gains were on multi-column contract-style scans, where Apple Vision preserved substantially more usable text with far fewer artifacts. On simpler UI screenshots, both engines performed similarly.
+
+These results are directional rather than a public benchmark suite. The corpus is not included in this repository, and future benchmark fixtures should use synthetic or public-domain documents only.
+
 ## License

 MIT
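The pipeline diagram and parameters in the README above (role-separated messages, `<ocr_source>` wrapping, `temperature=0`, `top_p=1`, `num_predict=-1`) pin down the shape of the Ollama request. As a standalone illustration only — the package's real `dist/markdown/ollama.js` is not shown in this diff, and `SYSTEM_PROMPT` below is a stand-in for the prompt the package actually ships — such a call against Ollama's `/api/chat` endpoint looks roughly like this:

```ts
// Hedged sketch of the documented request shape; not the package's code.
// SYSTEM_PROMPT is a placeholder — the real prompt lives in dist/markdown/prompt.js.
const SYSTEM_PROMPT =
  'Rewrite the OCR text as Markdown. Preserve the source text; do not summarise or add content.';

async function formatChunk(
  ocrText: string,
  model = 'mistral-nemo',
  baseUrl = 'http://localhost:11434',
): Promise<string> {
  const res = await fetch(`${baseUrl}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      stream: false, // one JSON response instead of an NDJSON stream
      messages: [
        { role: 'system', content: SYSTEM_PROMPT },
        // Wrapping in <ocr_source> tags makes it less likely that document
        // text is treated as user instructions (see the README above).
        { role: 'user', content: `<ocr_source>\n${ocrText}\n</ocr_source>` },
      ],
      options: { temperature: 0, top_p: 1, num_predict: -1 },
    }),
  });
  if (!res.ok) throw new Error(`Ollama request failed: HTTP ${res.status}`);
  const data = (await res.json()) as { message: { content: string } };
  return data.message.content;
}
```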
package/bin/pdf-helper CHANGED
Binary file
package/bin/vision-helper CHANGED
Binary file
package/dist/cli.js CHANGED

@@ -1,91 +1,154 @@
 #!/usr/bin/env node
-import { resolve } from 'path';
+import { resolve, dirname, basename, extname, join } from 'path';
+import { writeFile } from 'fs/promises';
 import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify, } from './index.js';
 const USAGE = `
-Usage: vision
+Usage: macos-vision [options] <image-or-pdf>

-
-  --ocr
-  --blocks
-  --faces
-  --barcodes
-  --rectangles
-  --document
-  --classify
-  --all
+Vision options:
+  --ocr                OCR — plain text (default)
+  --blocks             OCR — structured blocks with coordinates
+  --faces              Face detection
+  --barcodes           Barcode & QR code detection
+  --rectangles         Rectangle detection
+  --document           Document boundary detection
+  --classify           Image classification
+  --all                Run all of the above

-
+Markdown options (requires Ollama running locally):
+  --markdown           Convert image/PDF to Markdown via VisionScribe + Ollama
+  --model <name>       Ollama model name (default: mistral-nemo)
+  --ollama-url <url>   Ollama base URL (default: http://localhost:11434)
+  -o, --output <path>  Write Markdown to specified file
+  --stdout             Print Markdown to stdout instead of a file
+
+  --help               Show this help

 Examples:
-  vision
-  vision
-  vision
+  macos-vision photo.jpg
+  macos-vision --blocks --faces photo.jpg
+  macos-vision --all photo.jpg
+  macos-vision --markdown invoice.pdf -o notes.md
+  macos-vision --markdown receipt.jpg --stdout
 `.trim();
 const rawArgs = process.argv.slice(2);
 if (rawArgs.includes('--help') || rawArgs.length === 0) {
     console.log(USAGE);
     process.exit(0);
 }
-
-
+// Strip value-bearing options first so the remaining tokens are either
+// boolean flags (`--something`) or positional file paths.
+function takeOpt(name, args) {
+    const i = args.indexOf(name);
+    if (i === -1)
+        return undefined;
+    const v = args[i + 1];
+    args.splice(i, 2);
+    return v;
+}
+const argv = [...rawArgs];
+const model = takeOpt('--model', argv);
+const ollamaUrl = takeOpt('--ollama-url', argv);
+const outPath = takeOpt('-o', argv) ?? takeOpt('--output', argv);
+const flags = new Set(argv.filter((a) => a.startsWith('--')));
+const fileArgs = argv.filter((a) => !a.startsWith('-'));
 if (!fileArgs[0]) {
-    console.error('Error: no image path provided.\n');
+    console.error('Error: no image or PDF path provided.\n');
     console.log(USAGE);
     process.exit(1);
 }
-const
-
-
-const
-const
-
-
-
-
-
-const
-
-
-
-
-
-
-    flags.has('--classify');
-const useDefault = !anyFeatureFlag;
-async function main() {
-    try {
-        if (useDefault || runOcr) {
-            const text = await ocr(imagePath);
-            console.log(text);
-        }
-        if (runBlocks) {
-            const blocks = (await ocr(imagePath, { format: 'blocks' }));
-            console.log(JSON.stringify(blocks, null, 2));
-        }
-        if (runFaces) {
-            const faces = (await detectFaces(imagePath));
-            console.log(JSON.stringify(faces, null, 2));
+const inputPath = resolve(fileArgs[0]);
+// ─── Markdown pipeline ─────────────────────────────────────────────────────────────
+if (flags.has('--markdown')) {
+    const toStdout = flags.has('--stdout');
+    const opts = {};
+    if (model)
+        opts.model = model;
+    if (ollamaUrl)
+        opts.ollamaUrl = ollamaUrl;
+    (async () => {
+        const { VisionScribe, OllamaUnavailableError } = await import('./markdown/index.js');
+        const scribe = new VisionScribe(opts);
+        if (!toStdout)
+            process.stderr.write(`Converting ${fileArgs[0]}…\n`);
+        let markdown;
+        try {
+            markdown = await scribe.toMarkdown(inputPath);
         }
-
-
-
+        catch (err) {
+            if (err instanceof OllamaUnavailableError) {
+                console.error(err.message);
+                process.exit(2);
+            }
+            throw err;
         }
-        if (
-
-
+        if (toStdout) {
+            process.stdout.write(markdown);
+            return;
         }
-
-
-
+        const finalPath = outPath ??
+            join(dirname(inputPath), basename(inputPath, extname(inputPath)) + '.md');
+        await writeFile(finalPath, markdown, 'utf8');
+        process.stderr.write(`Saved: ${finalPath}\n`);
+    })().catch((err) => {
+        console.error(err instanceof Error ? err.message : String(err));
+        process.exit(1);
+    });
+}
+else {
+    // ─── Vision pipeline (OCR / detections / classification) ───────────────────────
+    const runAll = flags.has('--all');
+    const runOcr = runAll || flags.has('--ocr');
+    const runBlocks = runAll || flags.has('--blocks');
+    const runFaces = runAll || flags.has('--faces');
+    const runBarcodes = runAll || flags.has('--barcodes');
+    const runRects = runAll || flags.has('--rectangles');
+    const runDoc = runAll || flags.has('--document');
+    const runClassify = runAll || flags.has('--classify');
+    // Default: OCR text when no feature flag is given
+    const anyFeatureFlag = runAll ||
+        flags.has('--ocr') ||
+        flags.has('--blocks') ||
+        flags.has('--faces') ||
+        flags.has('--barcodes') ||
+        flags.has('--rectangles') ||
+        flags.has('--document') ||
+        flags.has('--classify');
+    const useDefault = !anyFeatureFlag;
+    (async () => {
+        try {
+            if (useDefault || runOcr) {
+                const text = await ocr(inputPath);
+                console.log(text);
+            }
+            if (runBlocks) {
+                const blocks = (await ocr(inputPath, { format: 'blocks' }));
+                console.log(JSON.stringify(blocks, null, 2));
+            }
+            if (runFaces) {
+                const faces = (await detectFaces(inputPath));
+                console.log(JSON.stringify(faces, null, 2));
+            }
+            if (runBarcodes) {
+                const barcodes = (await detectBarcodes(inputPath));
+                console.log(JSON.stringify(barcodes, null, 2));
+            }
+            if (runRects) {
+                const rectangles = (await detectRectangles(inputPath));
+                console.log(JSON.stringify(rectangles, null, 2));
+            }
+            if (runDoc) {
+                const doc = (await detectDocument(inputPath));
+                console.log(JSON.stringify(doc, null, 2));
+            }
+            if (runClassify) {
+                const labels = (await classify(inputPath));
+                console.log(JSON.stringify(labels, null, 2));
+            }
         }
-
-
-
+        catch (error) {
+            console.error('Error:', error);
+            process.exit(1);
         }
-    }
-    catch (error) {
-        console.error('Error:', error);
-        process.exit(1);
-    }
+    })();
 }
-main();
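The new argument handling above is two-phase: `takeOpt` splices each value-bearing option (and its value) out of the array, and the remaining tokens are classified as boolean flags or positional paths. A standalone TypeScript rendering of that behavior, with an illustrative `argv`:

```ts
// Same two-phase parsing as dist/cli.js, typed: strip value-bearing
// options first, then classify the leftover tokens.
function takeOpt(name: string, args: string[]): string | undefined {
  const i = args.indexOf(name);
  if (i === -1) return undefined;
  const v = args[i + 1];
  args.splice(i, 2); // remove the option and its value in place
  return v;
}

const argv = ['--markdown', '--model', 'llama3.2', 'invoice.pdf', '-o', 'notes.md'];
const model = takeOpt('--model', argv);   // 'llama3.2'
const outPath = takeOpt('-o', argv);      // 'notes.md'
const flags = new Set(argv.filter((a) => a.startsWith('--'))); // Set { '--markdown' }
const fileArgs = argv.filter((a) => !a.startsWith('-'));       // ['invoice.pdf']
console.log({ model, outPath, flags: [...flags], fileArgs });
```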
package/dist/index.d.ts CHANGED

@@ -112,3 +112,5 @@ export interface Classification {
 export declare function classify(imagePath: string): Promise<Classification[]>;
 export type { BlockKind, BaseBlock, TextBlock, FaceBlock, BarcodeBlock, RectangleBlock, DocumentBlock, LayoutBlock, InferLayoutInput, } from './layout.js';
 export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
+export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
+export type { VisionScribeOptions, ParagraphGroup } from './markdown/index.js';
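These root re-exports mean consumers no longer need the deep `macos-vision/markdown` path for types. A minimal consumer-side check, assuming the published typings match the README tables above:

```ts
import { VisionScribe, OllamaUnavailableError } from 'macos-vision';
import type { VisionScribeOptions } from 'macos-vision';

// Option names and defaults taken from the README's VisionScribe table.
const opts: VisionScribeOptions = { model: 'mistral-nemo', skipPing: true };
const scribe = new VisionScribe(opts);

// The error is exported as a class, so instanceof narrowing works:
const isOllamaDown = (e: unknown): boolean => e instanceof OllamaUnavailableError;
```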
package/dist/index.js CHANGED

@@ -15,7 +15,7 @@ async function run(flag, imagePath) {
     });
     return stdout;
 }
-// ─── PDF helpers
+// ─── PDF helpers ─────────────────────────────────────────────────────
 /**
  * Returns true if the file at `filePath` is a PDF.
  * Uses extension as a fast path; falls back to magic bytes (`%PDF`) for

@@ -75,11 +75,11 @@ async function ocrPdf(pdfPath, format) {
 export async function ocr(imagePath, options = {}) {
     const absPath = resolve(imagePath);
     const { format = 'text' } = options;
-    // ── PDF fast-path: rasterize via sips, then OCR each page
+    // ── PDF fast-path: rasterize via sips, then OCR each page ────────────
     if (await isPdf(absPath)) {
         return ocrPdf(absPath, format);
     }
-    // ── Existing image path (unchanged)
+    // ── Existing image path (unchanged) ─────────────────────────────────
     if (format === 'blocks') {
         const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
             timeout: BINARY_TIMEOUT_MS,

@@ -128,3 +128,5 @@ export async function classify(imagePath) {
     return raw;
 }
 export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
+// ─── Markdown pipeline (VisionScribe) ──────────────────────────────────────────
+export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
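The first hunk above shows only the doc comment for `isPdf`; its body falls outside the diff. Under the documented strategy — trust the `.pdf` extension as a fast path, otherwise read the `%PDF` magic bytes — a sketch implementation (not the package's actual code) could be:

```ts
import { open } from 'fs/promises';

// Sketch of the documented detection: extension fast path, then the
// first four bytes compared against the '%PDF' magic number.
async function isPdf(filePath: string): Promise<boolean> {
  if (filePath.toLowerCase().endsWith('.pdf')) return true;
  const fh = await open(filePath, 'r');
  try {
    const buf = Buffer.alloc(4);
    await fh.read(buf, 0, 4, 0); // read bytes 0..3
    return buf.toString('latin1') === '%PDF';
  } finally {
    await fh.close();
  }
}
```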
package/dist/markdown/chunker.d.ts ADDED

@@ -0,0 +1,11 @@
+import type { ParagraphGroup } from './prompt.js';
+export declare function estimateTokens(text: string): number;
+/**
+ * Split an array of paragraphs into chunks where each chunk's estimated prompt
+ * token count stays within `chunkSizeTokens`. Paragraph boundaries are never
+ * split — chunks always break between `ParagraphGroup` objects.
+ *
+ * A paragraph whose estimated token count exceeds the budget on its own is
+ * emitted as a singleton chunk with a warning.
+ */
+export declare function chunkParagraphs(paragraphs: ParagraphGroup[], chunkSizeTokens: number): ParagraphGroup[][];
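The declaration file documents the chunker's contract but not its body. A minimal implementation consistent with that contract might look like the sketch below; the shipped `chunker.js` may differ, the "~4 characters per token" estimate is a common rough heuristic rather than the package's own, and `ParagraphGroup` is assumed here to expose a `text` string:

```ts
interface ParagraphGroup {
  text: string; // assumed shape — the real type lives in dist/markdown/prompt.d.ts
}

// Rough heuristic: ~4 characters per estimated token.
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

// Greedy packing that never splits a paragraph; an oversized paragraph
// becomes a singleton chunk with a warning, per the documented contract.
function chunkParagraphs(
  paragraphs: ParagraphGroup[],
  chunkSizeTokens: number,
): ParagraphGroup[][] {
  const chunks: ParagraphGroup[][] = [];
  let current: ParagraphGroup[] = [];
  let used = 0;

  for (const p of paragraphs) {
    const cost = estimateTokens(p.text);
    if (cost > chunkSizeTokens) {
      if (current.length > 0) {
        chunks.push(current);
        current = [];
        used = 0;
      }
      console.warn(`paragraph exceeds token budget (${cost} > ${chunkSizeTokens})`);
      chunks.push([p]); // emitted as a singleton chunk
      continue;
    }
    if (used + cost > chunkSizeTokens && current.length > 0) {
      chunks.push(current);
      current = [];
      used = 0;
    }
    current.push(p);
    used += cost;
  }
  if (current.length > 0) chunks.push(current);
  return chunks;
}
```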