macos-vision 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +161 -103
- package/bin/pdf-helper +0 -0
- package/bin/vision-helper +0 -0
- package/dist/cli.js +131 -68
- package/dist/index.d.ts +2 -0
- package/dist/index.js +5 -3
- package/dist/markdown/chunker.d.ts +11 -0
- package/dist/markdown/chunker.js +39 -0
- package/dist/markdown/index.d.ts +61 -0
- package/dist/markdown/index.js +92 -0
- package/dist/markdown/ollama.d.ts +21 -0
- package/dist/markdown/ollama.js +50 -0
- package/dist/markdown/prompt.d.ts +35 -0
- package/dist/markdown/prompt.js +82 -0
- package/package.json +30 -5
- package/src/native/pdf-helper.swift +122 -0
- package/src/native/vision-helper.swift +241 -0
- package/.husky/commit-msg +0 -2
- package/.husky/pre-commit +0 -3
- package/.prettierignore +0 -4
- package/.prettierrc.json +0 -7
- package/.release-it.json +0 -20
- package/CHANGELOG.md +0 -44
- package/commitlint.config.js +0 -1
- package/debug.js +0 -37
- package/eslint.config.js +0 -21
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# macos-vision
|
|
2
2
|
|
|
3
|
-
> Apple Vision for Node.js — native, fast, offline
|
|
3
|
+
> Apple Vision for Node.js — native, fast, offline. Now with an optional Ollama-driven Markdown pipeline.
|
|
4
4
|
|
|
5
5
|
Uses macOS's built-in [Vision framework](https://developer.apple.com/documentation/vision) via a compiled Swift binary. Works completely offline. No cloud services, no API keys, no Python, zero runtime dependencies.
|
|
6
6
|
|
|
@@ -8,11 +8,8 @@ Uses macOS's built-in [Vision framework](https://developer.apple.com/documentati
|
|
|
8
8
|
|
|
9
9
|
- macOS 12+
|
|
10
10
|
- Node.js 18+
|
|
11
|
-
- Xcode Command Line Tools
|
|
12
|
-
|
|
13
|
-
```bash
|
|
14
|
-
xcode-select --install
|
|
15
|
-
```
|
|
11
|
+
- Xcode Command Line Tools (`xcode-select --install`)
|
|
12
|
+
- [Ollama](https://ollama.com) running locally — only if you use the Markdown pipeline
|
|
16
13
|
|
|
17
14
|
## Installation
|
|
18
15
|
|
|
@@ -20,18 +17,18 @@ xcode-select --install
|
|
|
20
17
|
npm install macos-vision
|
|
21
18
|
```
|
|
22
19
|
|
|
23
|
-
The native Swift
|
|
24
|
-
|
|
25
|
-
## What this is (and isn't)
|
|
26
|
-
|
|
27
|
-
`macos-vision` gives you **raw Apple Vision results** — text, coordinates, bounding boxes, labels.
|
|
20
|
+
The native Swift binaries (`vision-helper`, `pdf-helper`) are compiled automatically on install.
|
|
28
21
|
|
|
29
|
-
|
|
30
|
-
- Convert PDFs or images to Markdown
|
|
31
|
-
- Understand document structure (headings, tables, paragraphs)
|
|
32
|
-
- Chain multiple detections into a final report
|
|
22
|
+
## What you get
|
|
33
23
|
|
|
34
|
-
|
|
24
|
+
| Capability | Engine | Network |
|
|
25
|
+
|---|---|---|
|
|
26
|
+
| OCR (text + bounding boxes) | Apple Vision | offline |
|
|
27
|
+
| Face / barcode / rectangle / document detection | Apple Vision | offline |
|
|
28
|
+
| Image classification | Apple Vision | offline |
|
|
29
|
+
| Layout inference (lines, paragraphs, reading order) | heuristic in TypeScript | offline |
|
|
30
|
+
| PDF rasterization | PDFKit (`pdf-helper`) | offline |
|
|
31
|
+
| **Image / PDF → Markdown** | Apple Vision OCR + local LLM via Ollama | local LLM call |
|
|
35
32
|
|
|
36
33
|
---
|
|
37
34
|
|
|
@@ -44,60 +41,71 @@ npx macos-vision photo.jpg
|
|
|
44
41
|
# Structured OCR blocks with bounding boxes
|
|
45
42
|
npx macos-vision --blocks photo.jpg
|
|
46
43
|
|
|
47
|
-
#
|
|
44
|
+
# Detections
|
|
48
45
|
npx macos-vision --faces photo.jpg
|
|
49
|
-
|
|
50
|
-
# Detect barcodes and QR codes
|
|
51
46
|
npx macos-vision --barcodes photo.jpg
|
|
52
|
-
|
|
53
|
-
# Detect rectangular shapes
|
|
54
47
|
npx macos-vision --rectangles photo.jpg
|
|
55
|
-
|
|
56
|
-
# Find document boundary
|
|
57
48
|
npx macos-vision --document photo.jpg
|
|
58
|
-
|
|
59
|
-
# Classify image content
|
|
60
49
|
npx macos-vision --classify photo.jpg
|
|
61
50
|
|
|
62
51
|
# Run all detections at once
|
|
63
52
|
npx macos-vision --all photo.jpg
|
|
53
|
+
|
|
54
|
+
# Image / PDF → Markdown via VisionScribe + Ollama
|
|
55
|
+
npx macos-vision --markdown invoice.pdf -o notes.md
|
|
56
|
+
npx macos-vision --markdown receipt.jpg --stdout
|
|
57
|
+
npx macos-vision --markdown scan.png --model llama3.2
|
|
64
58
|
```
|
|
65
59
|
|
|
66
|
-
Multiple flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg
|
|
60
|
+
Multiple Vision flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg`. Structured results are printed as JSON to stdout.
|
|
61
|
+
|
|
62
|
+
### CLI flags
|
|
67
63
|
|
|
68
|
-
|
|
64
|
+
| Flag | Description |
|
|
65
|
+
|---|---|
|
|
66
|
+
| `--ocr` | Plain text OCR (default when no flag is given) |
|
|
67
|
+
| `--blocks` | OCR with bounding boxes (JSON) |
|
|
68
|
+
| `--faces` / `--barcodes` / `--rectangles` / `--document` / `--classify` | Vision detections (JSON) |
|
|
69
|
+
| `--all` | Run every Vision detection at once |
|
|
70
|
+
| `--markdown` | Convert image / PDF to Markdown via VisionScribe + Ollama |
|
|
71
|
+
| `--model <name>` | Ollama model (default: `mistral-nemo`). Only used with `--markdown` |
|
|
72
|
+
| `--ollama-url <url>` | Ollama base URL (default: `http://localhost:11434`). Only used with `--markdown` |
|
|
73
|
+
| `-o`, `--output <path>` | Write Markdown to a file. Only used with `--markdown` |
|
|
74
|
+
| `--stdout` | Print Markdown to stdout instead of a file. Only used with `--markdown` |
|
|
75
|
+
| `--help` | Show usage |
|
|
69
76
|
|
|
70
77
|
---
|
|
71
78
|
|
|
72
|
-
## API
|
|
79
|
+
## API — Vision
|
|
73
80
|
|
|
74
81
|
```js
|
|
75
|
-
import {
|
|
82
|
+
import {
|
|
83
|
+
ocr,
|
|
84
|
+
detectFaces,
|
|
85
|
+
detectBarcodes,
|
|
86
|
+
detectRectangles,
|
|
87
|
+
detectDocument,
|
|
88
|
+
classify,
|
|
89
|
+
inferLayout,
|
|
90
|
+
} from 'macos-vision';
|
|
76
91
|
|
|
77
92
|
// OCR — plain text
|
|
78
|
-
const text = await ocr('photo.jpg')
|
|
93
|
+
const text = await ocr('photo.jpg');
|
|
79
94
|
|
|
80
95
|
// OCR — structured blocks with bounding boxes
|
|
81
|
-
const blocks = await ocr('photo.jpg', { format: 'blocks' })
|
|
82
|
-
|
|
83
|
-
// Detect faces
|
|
84
|
-
const faces = await detectFaces('photo.jpg')
|
|
85
|
-
|
|
86
|
-
// Detect barcodes and QR codes
|
|
87
|
-
const codes = await detectBarcodes('invoice.jpg')
|
|
88
|
-
|
|
89
|
-
// Detect rectangular shapes (tables, forms, cards)
|
|
90
|
-
const rects = await detectRectangles('document.jpg')
|
|
96
|
+
const blocks = await ocr('photo.jpg', { format: 'blocks' });
|
|
91
97
|
|
|
92
|
-
//
|
|
93
|
-
const
|
|
98
|
+
// Detect faces / barcodes / rectangles / document boundary
|
|
99
|
+
const faces = await detectFaces('photo.jpg');
|
|
100
|
+
const codes = await detectBarcodes('invoice.jpg');
|
|
101
|
+
const rects = await detectRectangles('document.jpg');
|
|
102
|
+
const doc = await detectDocument('photo.jpg'); // DocumentBounds | null
|
|
94
103
|
|
|
95
104
|
// Classify image content
|
|
96
|
-
const labels = await classify('photo.jpg')
|
|
105
|
+
const labels = await classify('photo.jpg');
|
|
97
106
|
|
|
98
107
|
// Layout inference — unified reading-order-sorted representation
|
|
99
|
-
const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes })
|
|
100
|
-
// layout is LayoutBlock[] — ready to feed into a Markdown renderer or LLM context
|
|
108
|
+
const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes });
|
|
101
109
|
```
|
|
102
110
|
|
|
103
111
|
### Layout inference
|
|
@@ -134,111 +142,161 @@ for (const block of layout) {
|
|
|
134
142
|
|
|
135
143
|
> **Note:** Layout inference is a heuristic layer. It does not understand multi-column layouts or rotated text. Treat it as structured input for downstream tools, not as ground truth.
|
|
136
144
|
|
|
137
|
-
|
|
145
|
+
---
|
|
138
146
|
|
|
139
|
-
|
|
147
|
+
## API — Markdown pipeline (VisionScribe)
|
|
140
148
|
|
|
141
|
-
|
|
149
|
+
`VisionScribe` converts an image or PDF to Markdown by combining Apple Vision OCR with a local LLM (via Ollama). The LLM never sees the image — it only formats text that Vision has already extracted, which keeps the OCR stage deterministic and reduces the risk of the hallucinations typical of cloud vision APIs (see "Known limitations" below for cases where smaller models may still paraphrase).
|
|
142
150
|
|
|
143
|
-
|
|
144
|
-
|-----------|------|---------|-------------|
|
|
145
|
-
| `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) |
|
|
146
|
-
| `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
|
|
147
|
-
|
|
148
|
-
Returns `Promise<string>` or `Promise<VisionBlock[]>`.
|
|
151
|
+
### Prerequisites
|
|
149
152
|
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
y: number // 0–1 from top
|
|
155
|
-
width: number // 0–1
|
|
156
|
-
height: number // 0–1
|
|
157
|
-
}
|
|
153
|
+
```bash
|
|
154
|
+
brew install ollama
|
|
155
|
+
ollama serve # keep this running
|
|
156
|
+
ollama pull mistral-nemo
|
|
158
157
|
```
|
|
159
158
|
|
|
160
|
-
|
|
159
|
+
### Quick start
|
|
161
160
|
|
|
162
|
-
|
|
161
|
+
```ts
|
|
162
|
+
import { VisionScribe } from 'macos-vision';
|
|
163
|
+
|
|
164
|
+
const scribe = new VisionScribe();
|
|
165
|
+
const markdown = await scribe.toMarkdown('receipt.png');
|
|
166
|
+
console.log(markdown);
|
|
167
|
+
```
|
|
163
168
|
|
|
164
|
-
|
|
169
|
+
For a narrower import surface that pulls in only the markdown sub-module:
|
|
165
170
|
|
|
166
171
|
```ts
|
|
167
|
-
|
|
168
|
-
x: number; y: number; width: number; height: number
|
|
169
|
-
confidence: number // 0–1
|
|
170
|
-
}
|
|
172
|
+
import { VisionScribe } from 'macos-vision/markdown';
|
|
171
173
|
```
|
|
172
174
|
|
|
173
|
-
|
|
175
|
+
### How it works
|
|
174
176
|
|
|
175
|
-
|
|
177
|
+
```
|
|
178
|
+
Image / PDF
|
|
179
|
+
│
|
|
180
|
+
▼
|
|
181
|
+
Apple Vision OCR ← macOS native, deterministic, zero hallucination
|
|
182
|
+
│ VisionBlock[] per page
|
|
183
|
+
▼
|
|
184
|
+
Per-page layout inference ← each page processed independently (page-local coords)
|
|
185
|
+
│ paragraphId, lineId, y
|
|
186
|
+
▼
|
|
187
|
+
Chunker ← batches paragraphs to fit the LLM output window
|
|
188
|
+
│ ParagraphGroup[][]
|
|
189
|
+
▼
|
|
190
|
+
Ollama /api/chat ← system prompt as role:"system", OCR text as role:"user"
|
|
191
|
+
│ temperature=0, top_p=1, num_predict=-1
|
|
192
|
+
▼
|
|
193
|
+
Markdown string ← chunk results joined with blank lines
|
|
194
|
+
```
|
|
176
195
|
|
|
177
|
-
|
|
196
|
+
The LLM never sees the raw image; it only formats text that Apple Vision has already extracted. The system prompt instructs the model to act as a high-fidelity document parser and explicitly forbids summarising, paraphrasing, or adding content. OCR text is wrapped in `<ocr_source>` tags so the model cannot mistake it for a user asking a question. Because each page is processed independently, paragraph coordinates from different pages are never mixed.
|
|
178
197
|
|
|
179
|
-
|
|
180
|
-
interface Barcode {
|
|
181
|
-
type: string // e.g. 'org.iso.QRCode', 'org.gs1.EAN-13'
|
|
182
|
-
value: string // decoded content
|
|
183
|
-
x: number; y: number; width: number; height: number
|
|
184
|
-
}
|
|
185
|
-
```
|
|
198
|
+
### `new VisionScribe(options?)`
|
|
186
199
|
|
|
187
|
-
|
|
200
|
+
| Option | Type | Default | Description |
|
|
201
|
+
|---|---|---|---|
|
|
202
|
+
| `model` | `string` | `'mistral-nemo'` | Ollama model name |
|
|
203
|
+
| `ollamaUrl` | `string` | `'http://localhost:11434'` | Base URL of the Ollama server |
|
|
204
|
+
| `skipPing` | `boolean` | `false` | Skip per-call Ollama health check (useful in batch loops) |
|
|
205
|
+
| `chunkSizeTokens` | `number` | `1800` | Max estimated output tokens per LLM chunk. Lower = more chunks (safer for small models); higher = fewer calls but risks hitting model output limits |
|
|
206
|
+
|
|
207
|
+
### `scribe.toMarkdown(imagePath)`
|
|
188
208
|
|
|
189
|
-
|
|
209
|
+
- Accepts PNG, JPEG, HEIC, HEIF, TIFF, GIF, BMP, WebP and **PDF**
|
|
210
|
+
- Returns an empty string `''` if no text is detected
|
|
211
|
+
- Throws `OllamaUnavailableError` if the Ollama server is not reachable (unless `skipPing: true`)
|
|
190
212
|
|
|
191
|
-
|
|
213
|
+
### Batch processing
|
|
192
214
|
|
|
193
215
|
```ts
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
216
|
+
import { VisionScribe, OllamaUnavailableError } from 'macos-vision';
|
|
217
|
+
|
|
218
|
+
const scribe = new VisionScribe({ skipPing: true });
|
|
219
|
+
|
|
220
|
+
for (const file of files) {
|
|
221
|
+
try {
|
|
222
|
+
const md = await scribe.toMarkdown(file);
|
|
223
|
+
// …
|
|
224
|
+
} catch (e) {
|
|
225
|
+
if (e instanceof OllamaUnavailableError) {
|
|
226
|
+
console.error(e.message);
|
|
227
|
+
break;
|
|
228
|
+
}
|
|
229
|
+
throw e;
|
|
230
|
+
}
|
|
198
231
|
}
|
|
199
232
|
```
|
|
200
233
|
|
|
234
|
+
### Known limitations
|
|
235
|
+
|
|
236
|
+
- **Local model fidelity**: small models (`mistral-nemo`, `gemma`) may occasionally summarise or paraphrase long, dense documents. Larger models (`llama3.1:70b`, `qwen2.5:32b`) produce significantly better fidelity.
|
|
237
|
+
- **Tables**: multi-column table layouts are partially supported. OCR reads cells in reading order but the LLM may not always reconstruct correct Markdown table syntax.
|
|
238
|
+
- **Images / charts**: non-textual content (photos, diagrams, charts) is ignored — only text blocks extracted by Apple Vision are processed.
|
|
239
|
+
|
|
201
240
|
---
|
|
202
241
|
|
|
203
|
-
|
|
242
|
+
## Migrating from `macos-vision-md`
|
|
204
243
|
|
|
205
|
-
|
|
244
|
+
The standalone [`macos-vision-md`](https://github.com/woladi/macos-vision-md) package has been merged into `macos-vision` as of v1.3.0. The old package will keep working as a thin re-export shim, but new projects should depend on `macos-vision` directly.
|
|
206
245
|
|
|
207
|
-
```
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
246
|
+
```diff
|
|
247
|
+
- import { VisionScribe } from 'macos-vision-md';
|
|
248
|
+
+ import { VisionScribe } from 'macos-vision';
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
```diff
|
|
252
|
+
- macos-vision-md invoice.pdf -o notes.md
|
|
253
|
+
+ macos-vision --markdown invoice.pdf -o notes.md
|
|
213
254
|
```
|
|
214
255
|
|
|
256
|
+
The `VisionScribe` API, the system prompt, and the chunking strategy are unchanged. `OllamaUnavailableError`, `VisionScribeOptions`, and `ParagraphGroup` are now exported from `macos-vision`.
|
|
257
|
+
|
|
215
258
|
---
|
|
216
259
|
|
|
217
|
-
|
|
260
|
+
## API reference — types
|
|
218
261
|
|
|
219
|
-
|
|
262
|
+
### `ocr(imagePath, options?)`
|
|
263
|
+
|
|
264
|
+
| Parameter | Type | Default | Description |
|
|
265
|
+
|-----------|------|---------|-------------|
|
|
266
|
+
| `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) or PDF |
|
|
267
|
+
| `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
|
|
268
|
+
|
|
269
|
+
Returns `Promise<string>` or `Promise<VisionBlock[]>`.
|
|
220
270
|
|
|
221
271
|
```ts
|
|
222
|
-
interface
|
|
223
|
-
|
|
224
|
-
|
|
272
|
+
interface VisionBlock {
|
|
273
|
+
text: string
|
|
274
|
+
x: number // 0–1 from left
|
|
275
|
+
y: number // 0–1 from top
|
|
276
|
+
width: number // 0–1
|
|
277
|
+
height: number // 0–1
|
|
278
|
+
confidence: number
|
|
279
|
+
page?: number // 0-based, only for PDFs
|
|
225
280
|
}
|
|
226
281
|
```
|
|
227
282
|
|
|
283
|
+
### `detectFaces(imagePath)` / `detectBarcodes(imagePath)` / `detectRectangles(imagePath)` / `detectDocument(imagePath)` / `classify(imagePath)`
|
|
284
|
+
|
|
285
|
+
See `src/index.ts` for full type declarations.
|
|
286
|
+
|
|
228
287
|
---
|
|
229
288
|
|
|
230
289
|
## Why macos-vision?
|
|
231
290
|
|
|
232
291
|
| | macos-vision | Tesseract.js | Cloud APIs |
|
|
233
292
|
|---|---|---|---|
|
|
234
|
-
| Offline | ✅ | ✅ | ❌ |
|
|
293
|
+
| Offline OCR | ✅ | ✅ | ❌ |
|
|
294
|
+
| Offline image → Markdown | ✅ (with local Ollama) | ❌ | ❌ |
|
|
235
295
|
| No API key | ✅ | ✅ | ❌ |
|
|
236
296
|
| Native speed | ✅ | ❌ | — |
|
|
237
297
|
| Zero runtime deps | ✅ | ❌ | ❌ |
|
|
238
298
|
| OCR with bounding boxes | ✅ | ✅ | ✅ |
|
|
239
|
-
| Face detection | ✅ | ❌ | ✅ |
|
|
240
|
-
| Barcode / QR | ✅ | ❌ | ✅ |
|
|
241
|
-
| Document detection | ✅ | ❌ | ✅ |
|
|
299
|
+
| Face / barcode / document detection | ✅ | ❌ | ✅ |
|
|
242
300
|
| Image classification | ✅ | ❌ | ✅ |
|
|
243
301
|
| macOS only | ✅ | ❌ | ❌ |
|
|
244
302
|
|
package/bin/pdf-helper
CHANGED
|
Binary file
|
package/bin/vision-helper
CHANGED
|
Binary file
|
package/dist/cli.js
CHANGED
|
@@ -1,91 +1,154 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { resolve } from 'path';
|
|
2
|
+
import { resolve, dirname, basename, extname, join } from 'path';
|
|
3
|
+
import { writeFile } from 'fs/promises';
|
|
3
4
|
import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify, } from './index.js';
|
|
4
5
|
const USAGE = `
|
|
5
|
-
Usage: vision
|
|
6
|
+
Usage: macos-vision [options] <image-or-pdf>
|
|
6
7
|
|
|
7
|
-
|
|
8
|
-
--ocr
|
|
9
|
-
--blocks
|
|
10
|
-
--faces
|
|
11
|
-
--barcodes
|
|
12
|
-
--rectangles
|
|
13
|
-
--document
|
|
14
|
-
--classify
|
|
15
|
-
--all
|
|
8
|
+
Vision options:
|
|
9
|
+
--ocr OCR — plain text (default)
|
|
10
|
+
--blocks OCR — structured blocks with coordinates
|
|
11
|
+
--faces Face detection
|
|
12
|
+
--barcodes Barcode & QR code detection
|
|
13
|
+
--rectangles Rectangle detection
|
|
14
|
+
--document Document boundary detection
|
|
15
|
+
--classify Image classification
|
|
16
|
+
--all Run all of the above
|
|
16
17
|
|
|
17
|
-
|
|
18
|
+
Markdown options (requires Ollama running locally):
|
|
19
|
+
--markdown Convert image/PDF to Markdown via VisionScribe + Ollama
|
|
20
|
+
--model <name> Ollama model name (default: mistral-nemo)
|
|
21
|
+
--ollama-url <url> Ollama base URL (default: http://localhost:11434)
|
|
22
|
+
-o, --output <path> Write Markdown to specified file
|
|
23
|
+
--stdout Print Markdown to stdout instead of a file
|
|
24
|
+
|
|
25
|
+
--help Show this help
|
|
18
26
|
|
|
19
27
|
Examples:
|
|
20
|
-
vision
|
|
21
|
-
vision
|
|
22
|
-
vision
|
|
28
|
+
macos-vision photo.jpg
|
|
29
|
+
macos-vision --blocks --faces photo.jpg
|
|
30
|
+
macos-vision --all photo.jpg
|
|
31
|
+
macos-vision --markdown invoice.pdf -o notes.md
|
|
32
|
+
macos-vision --markdown receipt.jpg --stdout
|
|
23
33
|
`.trim();
|
|
24
34
|
const rawArgs = process.argv.slice(2);
|
|
25
35
|
if (rawArgs.includes('--help') || rawArgs.length === 0) {
|
|
26
36
|
console.log(USAGE);
|
|
27
37
|
process.exit(0);
|
|
28
38
|
}
|
|
29
|
-
|
|
30
|
-
|
|
39
|
+
// Strip value-bearing options first so the remaining tokens are either
|
|
40
|
+
// boolean flags (`--something`) or positional file paths.
|
|
41
|
+
/**
 * Extract a value-bearing option (e.g. `--model <name>`) from `args`.
 *
 * Looks up the first occurrence of `name` in `args`. When found, the flag
 * and the token immediately after it are removed from `args` in place and
 * that following token is returned.
 *
 * @param name  Exact option token to look for (compared with `indexOf`).
 * @param args  Argument list; mutated when the option is present.
 * @returns The token following `name`, or `undefined` when `name` is absent
 *          or is the last element of `args` (no following token).
 */
function takeOpt(name, args) {
|
|
42
|
+
const i = args.indexOf(name);
|
|
43
|
+
if (i === -1)
|
|
44
|
+
return undefined;
|
|
45
|
+
const v = args[i + 1];
|
|
46
|
+
args.splice(i, 2); // drop the flag and its value so later filters only see boolean flags / file paths
|
|
47
|
+
return v;
|
|
48
|
+
}
|
|
49
|
+
const argv = [...rawArgs];
|
|
50
|
+
const model = takeOpt('--model', argv);
|
|
51
|
+
const ollamaUrl = takeOpt('--ollama-url', argv);
|
|
52
|
+
const outPath = takeOpt('-o', argv) ?? takeOpt('--output', argv);
|
|
53
|
+
const flags = new Set(argv.filter((a) => a.startsWith('--')));
|
|
54
|
+
const fileArgs = argv.filter((a) => !a.startsWith('-'));
|
|
31
55
|
if (!fileArgs[0]) {
|
|
32
|
-
console.error('Error: no image path provided.\n');
|
|
56
|
+
console.error('Error: no image or PDF path provided.\n');
|
|
33
57
|
console.log(USAGE);
|
|
34
58
|
process.exit(1);
|
|
35
59
|
}
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
const
|
|
40
|
-
const
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
const
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
flags.has('--classify');
|
|
54
|
-
const useDefault = !anyFeatureFlag;
|
|
55
|
-
async function main() {
|
|
56
|
-
try {
|
|
57
|
-
if (useDefault || runOcr) {
|
|
58
|
-
const text = await ocr(imagePath);
|
|
59
|
-
console.log(text);
|
|
60
|
-
}
|
|
61
|
-
if (runBlocks) {
|
|
62
|
-
const blocks = (await ocr(imagePath, { format: 'blocks' }));
|
|
63
|
-
console.log(JSON.stringify(blocks, null, 2));
|
|
64
|
-
}
|
|
65
|
-
if (runFaces) {
|
|
66
|
-
const faces = (await detectFaces(imagePath));
|
|
67
|
-
console.log(JSON.stringify(faces, null, 2));
|
|
60
|
+
const inputPath = resolve(fileArgs[0]);
|
|
61
|
+
// ─── Markdown pipeline ─────────────────────────────────────────────────────────────
|
|
62
|
+
if (flags.has('--markdown')) {
|
|
63
|
+
const toStdout = flags.has('--stdout');
|
|
64
|
+
const opts = {};
|
|
65
|
+
if (model)
|
|
66
|
+
opts.model = model;
|
|
67
|
+
if (ollamaUrl)
|
|
68
|
+
opts.ollamaUrl = ollamaUrl;
|
|
69
|
+
(async () => {
|
|
70
|
+
const { VisionScribe, OllamaUnavailableError } = await import('./markdown/index.js');
|
|
71
|
+
const scribe = new VisionScribe(opts);
|
|
72
|
+
if (!toStdout)
|
|
73
|
+
process.stderr.write(`Converting ${fileArgs[0]}…\n`);
|
|
74
|
+
let markdown;
|
|
75
|
+
try {
|
|
76
|
+
markdown = await scribe.toMarkdown(inputPath);
|
|
68
77
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
78
|
+
catch (err) {
|
|
79
|
+
if (err instanceof OllamaUnavailableError) {
|
|
80
|
+
console.error(err.message);
|
|
81
|
+
process.exit(2);
|
|
82
|
+
}
|
|
83
|
+
throw err;
|
|
72
84
|
}
|
|
73
|
-
if (
|
|
74
|
-
|
|
75
|
-
|
|
85
|
+
if (toStdout) {
|
|
86
|
+
process.stdout.write(markdown);
|
|
87
|
+
return;
|
|
76
88
|
}
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
89
|
+
const finalPath = outPath ??
|
|
90
|
+
join(dirname(inputPath), basename(inputPath, extname(inputPath)) + '.md');
|
|
91
|
+
await writeFile(finalPath, markdown, 'utf8');
|
|
92
|
+
process.stderr.write(`Saved: ${finalPath}\n`);
|
|
93
|
+
})().catch((err) => {
|
|
94
|
+
console.error(err instanceof Error ? err.message : String(err));
|
|
95
|
+
process.exit(1);
|
|
96
|
+
});
|
|
97
|
+
}
|
|
98
|
+
else {
|
|
99
|
+
// ─── Vision pipeline (OCR / detections / classification) ───────────────────────
|
|
100
|
+
const runAll = flags.has('--all');
|
|
101
|
+
const runOcr = runAll || flags.has('--ocr');
|
|
102
|
+
const runBlocks = runAll || flags.has('--blocks');
|
|
103
|
+
const runFaces = runAll || flags.has('--faces');
|
|
104
|
+
const runBarcodes = runAll || flags.has('--barcodes');
|
|
105
|
+
const runRects = runAll || flags.has('--rectangles');
|
|
106
|
+
const runDoc = runAll || flags.has('--document');
|
|
107
|
+
const runClassify = runAll || flags.has('--classify');
|
|
108
|
+
// Default: OCR text when no feature flag is given
|
|
109
|
+
const anyFeatureFlag = runAll ||
|
|
110
|
+
flags.has('--ocr') ||
|
|
111
|
+
flags.has('--blocks') ||
|
|
112
|
+
flags.has('--faces') ||
|
|
113
|
+
flags.has('--barcodes') ||
|
|
114
|
+
flags.has('--rectangles') ||
|
|
115
|
+
flags.has('--document') ||
|
|
116
|
+
flags.has('--classify');
|
|
117
|
+
const useDefault = !anyFeatureFlag;
|
|
118
|
+
(async () => {
|
|
119
|
+
try {
|
|
120
|
+
if (useDefault || runOcr) {
|
|
121
|
+
const text = await ocr(inputPath);
|
|
122
|
+
console.log(text);
|
|
123
|
+
}
|
|
124
|
+
if (runBlocks) {
|
|
125
|
+
const blocks = (await ocr(inputPath, { format: 'blocks' }));
|
|
126
|
+
console.log(JSON.stringify(blocks, null, 2));
|
|
127
|
+
}
|
|
128
|
+
if (runFaces) {
|
|
129
|
+
const faces = (await detectFaces(inputPath));
|
|
130
|
+
console.log(JSON.stringify(faces, null, 2));
|
|
131
|
+
}
|
|
132
|
+
if (runBarcodes) {
|
|
133
|
+
const barcodes = (await detectBarcodes(inputPath));
|
|
134
|
+
console.log(JSON.stringify(barcodes, null, 2));
|
|
135
|
+
}
|
|
136
|
+
if (runRects) {
|
|
137
|
+
const rectangles = (await detectRectangles(inputPath));
|
|
138
|
+
console.log(JSON.stringify(rectangles, null, 2));
|
|
139
|
+
}
|
|
140
|
+
if (runDoc) {
|
|
141
|
+
const doc = (await detectDocument(inputPath));
|
|
142
|
+
console.log(JSON.stringify(doc, null, 2));
|
|
143
|
+
}
|
|
144
|
+
if (runClassify) {
|
|
145
|
+
const labels = (await classify(inputPath));
|
|
146
|
+
console.log(JSON.stringify(labels, null, 2));
|
|
147
|
+
}
|
|
80
148
|
}
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
149
|
+
catch (error) {
|
|
150
|
+
console.error('Error:', error);
|
|
151
|
+
process.exit(1);
|
|
84
152
|
}
|
|
85
|
-
}
|
|
86
|
-
catch (error) {
|
|
87
|
-
console.error('Error:', error);
|
|
88
|
-
process.exit(1);
|
|
89
|
-
}
|
|
153
|
+
})();
|
|
90
154
|
}
|
|
91
|
-
main();
|
package/dist/index.d.ts
CHANGED
|
@@ -112,3 +112,5 @@ export interface Classification {
|
|
|
112
112
|
export declare function classify(imagePath: string): Promise<Classification[]>;
|
|
113
113
|
export type { BlockKind, BaseBlock, TextBlock, FaceBlock, BarcodeBlock, RectangleBlock, DocumentBlock, LayoutBlock, InferLayoutInput, } from './layout.js';
|
|
114
114
|
export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
|
|
115
|
+
export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
|
|
116
|
+
export type { VisionScribeOptions, ParagraphGroup } from './markdown/index.js';
|
package/dist/index.js
CHANGED
|
@@ -15,7 +15,7 @@ async function run(flag, imagePath) {
|
|
|
15
15
|
});
|
|
16
16
|
return stdout;
|
|
17
17
|
}
|
|
18
|
-
// ─── PDF helpers
|
|
18
|
+
// ─── PDF helpers ─────────────────────────────────────────────────────
|
|
19
19
|
/**
|
|
20
20
|
* Returns true if the file at `filePath` is a PDF.
|
|
21
21
|
* Uses extension as a fast path; falls back to magic bytes (`%PDF`) for
|
|
@@ -75,11 +75,11 @@ async function ocrPdf(pdfPath, format) {
|
|
|
75
75
|
export async function ocr(imagePath, options = {}) {
|
|
76
76
|
const absPath = resolve(imagePath);
|
|
77
77
|
const { format = 'text' } = options;
|
|
78
|
-
// ── PDF fast-path: rasterize via sips, then OCR each page
|
|
78
|
+
// ── PDF fast-path: rasterize via sips, then OCR each page ────────────
|
|
79
79
|
if (await isPdf(absPath)) {
|
|
80
80
|
return ocrPdf(absPath, format);
|
|
81
81
|
}
|
|
82
|
-
// ── Existing image path (unchanged)
|
|
82
|
+
// ── Existing image path (unchanged) ─────────────────────────────────
|
|
83
83
|
if (format === 'blocks') {
|
|
84
84
|
const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
|
|
85
85
|
timeout: BINARY_TIMEOUT_MS,
|
|
@@ -128,3 +128,5 @@ export async function classify(imagePath) {
|
|
|
128
128
|
return raw;
|
|
129
129
|
}
|
|
130
130
|
export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
|
|
131
|
+
// ─── Markdown pipeline (VisionScribe) ──────────────────────────────────────────
|
|
132
|
+
export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ParagraphGroup } from './prompt.js';
|
|
2
|
+
export declare function estimateTokens(text: string): number;
|
|
3
|
+
/**
|
|
4
|
+
* Split an array of paragraphs into chunks where each chunk's estimated prompt
|
|
5
|
+
* token count stays within `chunkSizeTokens`. Paragraph boundaries are never
|
|
6
|
+
* split — chunks always break between `ParagraphGroup` objects.
|
|
7
|
+
*
|
|
8
|
+
* A paragraph whose estimated token count exceeds the budget on its own is
|
|
9
|
+
* emitted as a singleton chunk with a warning.
|
|
10
|
+
*/
|
|
11
|
+
export declare function chunkParagraphs(paragraphs: ParagraphGroup[], chunkSizeTokens: number): ParagraphGroup[][];
|