macos-vision 1.2.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
  # macos-vision
 
- > Apple Vision for Node.js — native, fast, offline, no API keys required.
+ > Apple Vision for Node.js — native, fast, offline. Now with an optional Ollama-driven Markdown pipeline.
 
  Uses macOS's built-in [Vision framework](https://developer.apple.com/documentation/vision) via a compiled Swift binary. Works completely offline. No cloud services, no API keys, no Python, zero runtime dependencies.
 
@@ -8,11 +8,8 @@ Uses macOS's built-in [Vision framework](https://developer.apple.com/documentati
 
  - macOS 12+
  - Node.js 18+
- - Xcode Command Line Tools
-
- ```bash
- xcode-select --install
- ```
+ - Xcode Command Line Tools (`xcode-select --install`)
+ - [Ollama](https://ollama.com) running locally — only if you use the Markdown pipeline
 
  ## Installation
 
@@ -20,18 +17,18 @@ xcode-select --install
  npm install macos-vision
  ```
 
- The native Swift binary is compiled automatically on install.
-
- ## What this is (and isn't)
-
- `macos-vision` gives you **raw Apple Vision results** — text, coordinates, bounding boxes, labels.
+ The native Swift binaries (`vision-helper`, `pdf-helper`) are compiled automatically on install.
 
- It is **not** a document pipeline. It does not:
- - Convert PDFs or images to Markdown
- - Understand document structure (headings, tables, paragraphs)
- - Chain multiple detections into a final report
+ ## What you get
 
- For those use cases, use the raw output as input to an LLM or a post-processing layer of your own.
+ | Capability | Engine | Network |
+ |---|---|---|
+ | OCR (text + bounding boxes) | Apple Vision | offline |
+ | Face / barcode / rectangle / document detection | Apple Vision | offline |
+ | Image classification | Apple Vision | offline |
+ | Layout inference (lines, paragraphs, reading order) | heuristic in TypeScript | offline |
+ | PDF rasterization | PDFKit (`pdf-helper`) | offline |
+ | **Image / PDF → Markdown** | Apple Vision OCR + local LLM via Ollama | local LLM call |
 
  ---
 
@@ -44,60 +41,71 @@ npx macos-vision photo.jpg
  # Structured OCR blocks with bounding boxes
  npx macos-vision --blocks photo.jpg
 
- # Detect faces
+ # Detections
  npx macos-vision --faces photo.jpg
-
- # Detect barcodes and QR codes
  npx macos-vision --barcodes photo.jpg
-
- # Detect rectangular shapes
  npx macos-vision --rectangles photo.jpg
-
- # Find document boundary
  npx macos-vision --document photo.jpg
-
- # Classify image content
  npx macos-vision --classify photo.jpg
 
  # Run all detections at once
  npx macos-vision --all photo.jpg
+
+ # Image / PDF → Markdown via VisionScribe + Ollama
+ npx macos-vision --markdown invoice.pdf -o notes.md
+ npx macos-vision --markdown receipt.jpg --stdout
+ npx macos-vision --markdown scan.png --model llama3.2
  ```
 
- Multiple flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg`
+ Multiple Vision flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg`. Structured results are printed as JSON to stdout.
 
- Structured results are printed as JSON to stdout.
+ ### CLI flags
+
+ | Flag | Description |
+ |---|---|
+ | `--ocr` | Plain text OCR (default when no flag is given) |
+ | `--blocks` | OCR with bounding boxes (JSON) |
+ | `--faces` / `--barcodes` / `--rectangles` / `--document` / `--classify` | Vision detections (JSON) |
+ | `--all` | Run every Vision detection at once |
+ | `--markdown` | Convert image / PDF to Markdown via VisionScribe + Ollama |
+ | `--model <name>` | Ollama model (default: `mistral-nemo`). Only used with `--markdown` |
+ | `--ollama-url <url>` | Ollama base URL (default: `http://localhost:11434`). Only used with `--markdown` |
+ | `-o`, `--output <path>` | Write Markdown to a file. Only used with `--markdown` |
+ | `--stdout` | Print Markdown to stdout instead of a file. Only used with `--markdown` |
+ | `--help` | Show usage |
 
  ---
 
- ## API
+ ## API — Vision
 
  ```js
- import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify } from 'macos-vision'
+ import {
+   ocr,
+   detectFaces,
+   detectBarcodes,
+   detectRectangles,
+   detectDocument,
+   classify,
+   inferLayout,
+ } from 'macos-vision';
 
  // OCR — plain text
- const text = await ocr('photo.jpg')
+ const text = await ocr('photo.jpg');
 
  // OCR — structured blocks with bounding boxes
- const blocks = await ocr('photo.jpg', { format: 'blocks' })
-
- // Detect faces
- const faces = await detectFaces('photo.jpg')
-
- // Detect barcodes and QR codes
- const codes = await detectBarcodes('invoice.jpg')
+ const blocks = await ocr('photo.jpg', { format: 'blocks' });
 
- // Detect rectangular shapes (tables, forms, cards)
- const rects = await detectRectangles('document.jpg')
-
- // Find document boundary in a photo
- const doc = await detectDocument('photo.jpg') // DocumentBounds | null
+ // Detect faces / barcodes / rectangles / document boundary
+ const faces = await detectFaces('photo.jpg');
+ const codes = await detectBarcodes('invoice.jpg');
+ const rects = await detectRectangles('document.jpg');
+ const doc = await detectDocument('photo.jpg'); // DocumentBounds | null
 
  // Classify image content
- const labels = await classify('photo.jpg')
+ const labels = await classify('photo.jpg');
 
  // Layout inference — unified reading-order-sorted representation
- const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes })
- // layout is LayoutBlock[] — ready to feed into a Markdown renderer or LLM context
+ const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes });
  ```
 
  ### Layout inference
@@ -134,116 +142,173 @@ for (const block of layout) {
 
  > **Note:** Layout inference is a heuristic layer. It does not understand multi-column layouts or rotated text. Treat it as structured input for downstream tools, not as ground truth.
 
- ## API
+ ---
 
- ### `ocr(imagePath, options?)`
+ ## API — Markdown pipeline (VisionScribe)
 
- Extracts text from an image.
+ `VisionScribe` converts an image or PDF to Markdown by combining Apple Vision OCR with a local LLM (via Ollama). The LLM never sees the image — it only formats text that Vision already extracted. This keeps image processing local and reduces the risk of vision-model hallucinations, but Markdown reconstruction is still best-effort and depends on the local model and document complexity.
 
- | Parameter | Type | Default | Description |
- |-----------|------|---------|-------------|
- | `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) |
- | `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
+ ### Prerequisites
 
- Returns `Promise<string>` or `Promise<VisionBlock[]>`.
-
- ```ts
- interface VisionBlock {
-   text: string
-   x: number      // 0–1 from left
-   y: number      // 0–1 from top
-   width: number  // 0–1
-   height: number // 0–1
- }
+ ```bash
+ brew install ollama
+ ollama serve            # keep this running
+ ollama pull mistral-nemo
  ```
 
- ---
+ ### Quick start
 
- ### `detectFaces(imagePath)`
+ ```ts
+ import { VisionScribe } from 'macos-vision';
 
- Detects human faces and returns their bounding boxes.
+ const scribe = new VisionScribe();
+ const markdown = await scribe.toMarkdown('receipt.png');
+ console.log(markdown);
+ ```
+
+ For a narrower import surface that pulls in only the markdown sub-module:
 
  ```ts
- interface Face {
-   x: number; y: number; width: number; height: number
-   confidence: number // 0–1
- }
+ import { VisionScribe } from 'macos-vision/markdown';
  ```
 
- ---
+ ### How it works
 
- ### `detectBarcodes(imagePath)`
+ ```
+ Image / PDF
+      │
+      ▼
+ Apple Vision OCR            ← macOS native text extraction
+      │  VisionBlock[] per page
+      ▼
+ Per-page layout inference   ← each page processed independently (page-local coords)
+      │  paragraphId, lineId, y
+      ▼
+ Chunker                     ← batches paragraphs to fit the LLM output window
+      │  ParagraphGroup[][]
+      ▼
+ Ollama /api/chat            ← system prompt as role:"system", OCR text as role:"user"
+      │  temperature=0, top_p=1, num_predict=-1
+      ▼
+ Markdown string             ← chunk results joined with blank lines
+ ```
 
- Detects barcodes and QR codes and decodes their payload.
+ The system prompt asks the model to preserve the source text, avoid summarising, and avoid adding content. OCR text is wrapped in `<ocr_source>` tags so the model is less likely to treat document text as user instructions. Per-page processing prevents paragraph coordinates from different pages from being mixed.
 
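To make the request shape above concrete, here is a minimal sketch of an Ollama `/api/chat` call matching the diagram. The real system prompt and chunk plumbing are VisionScribe internals; the prompt string and `ocrText` below are illustrative placeholders.

```ts
// Sketch only: mirrors the request shape described above, not the package's code.
const ocrText = 'INVOICE #1042\nTotal: $120.00'; // one chunk of Vision OCR output

const res = await fetch('http://localhost:11434/api/chat', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    model: 'mistral-nemo',
    stream: false,
    messages: [
      // Illustrative stand-in for the package's actual system prompt:
      { role: 'system', content: 'Rewrite the OCR text as Markdown. Preserve the source text; do not summarise or add content.' },
      // OCR text wrapped in <ocr_source> tags, as described above:
      { role: 'user', content: `<ocr_source>\n${ocrText}\n</ocr_source>` },
    ],
    options: { temperature: 0, top_p: 1, num_predict: -1 },
  }),
});
const { message } = await res.json();
const markdownChunk: string = message.content; // chunks are later joined with blank lines
```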
- ```ts
- interface Barcode {
-   type: string  // e.g. 'org.iso.QRCode', 'org.gs1.EAN-13'
-   value: string // decoded content
-   x: number; y: number; width: number; height: number
- }
- ```
+ ### `new VisionScribe(options?)`
 
- ---
+ | Option | Type | Default | Description |
+ |---|---|---|---|
+ | `model` | `string` | `'mistral-nemo'` | Ollama model name |
+ | `ollamaUrl` | `string` | `'http://localhost:11434'` | Base URL of the Ollama server |
+ | `skipPing` | `boolean` | `false` | Skip per-call Ollama health check (useful in batch loops) |
+ | `chunkSizeTokens` | `number` | `1800` | Max estimated output tokens per LLM chunk. Lower = more chunks (safer for small models); higher = fewer calls but risks hitting model output limits |
+
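For illustration, tuning the `chunkSizeTokens` trade-off from the table above might look like this (the values are illustrative, not recommendations):

```ts
import { VisionScribe } from 'macos-vision';

// Fewer tokens per chunk means more Ollama calls, but each response stays
// well inside a small model's output window.
const smallModelScribe = new VisionScribe({ model: 'llama3.2', chunkSizeTokens: 900 });
```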
+ ### `scribe.toMarkdown(imagePath)`
 
- ### `detectRectangles(imagePath)`
+ - Accepts PNG, JPEG, HEIC, HEIF, TIFF, GIF, BMP, WebP and **PDF**
+ - Returns an empty string `''` if no text is detected
+ - Throws `OllamaUnavailableError` if the Ollama server is not reachable (unless `skipPing: true`)
 
- Finds rectangular shapes (documents, tables, cards, forms).
+ ### Batch processing
 
  ```ts
- interface Rectangle {
-   topLeft: [number, number]; topRight: [number, number]
-   bottomLeft: [number, number]; bottomRight: [number, number]
-   confidence: number
+ import { VisionScribe, OllamaUnavailableError } from 'macos-vision';
+
+ const scribe = new VisionScribe({ skipPing: true });
+
+ for (const file of files) {
+   try {
+     const md = await scribe.toMarkdown(file);
+     // …
+   } catch (e) {
+     if (e instanceof OllamaUnavailableError) {
+       console.error(e.message);
+       break;
+     }
+     throw e;
+   }
  }
  ```
 
+ ### Known limitations
+
+ - **Local model fidelity**: small models (`mistral-nemo`, `gemma`) may occasionally summarise or paraphrase long, dense documents. Larger models (`llama3.1:70b`, `qwen2.5:32b`) produce significantly better fidelity.
+ - **Tables**: multi-column table layouts are partially supported. OCR reads cells in reading order, but the LLM may not always reconstruct correct Markdown table syntax.
+ - **Images / charts**: non-textual content (photos, diagrams, charts) is ignored — only text blocks extracted by Apple Vision are processed.
+ - **Markdown fidelity**: the system prompt asks for faithful reconstruction, but LLM output carries no hard guarantee. Review important legal, financial, or compliance documents before relying on the generated Markdown.
+
  ---
 
- ### `detectDocument(imagePath)`
+ ## Migrating from `macos-vision-md`
 
- Finds the boundary of a document in a photo (e.g. paper on a desk). Returns `null` if no document is found.
+ The standalone [`macos-vision-md`](https://github.com/woladi/macos-vision-md) package has been merged into `macos-vision`. From `macos-vision-md` v2.0.0, the old package is a thin re-export shim that will keep working, but new projects should depend on `macos-vision` directly.
 
- ```ts
- interface DocumentBounds {
-   topLeft: [number, number]; topRight: [number, number]
-   bottomLeft: [number, number]; bottomRight: [number, number]
-   confidence: number
- }
+ ```diff
+ - import { VisionScribe } from 'macos-vision-md';
+ + import { VisionScribe } from 'macos-vision';
  ```
 
+ ```diff
+ - macos-vision-md invoice.pdf -o notes.md
+ + macos-vision --markdown invoice.pdf -o notes.md
+ ```
+
+ The `VisionScribe` API, the system prompt, and the chunking strategy are unchanged. `OllamaUnavailableError`, `VisionScribeOptions`, and `ParagraphGroup` are now exported from `macos-vision`.
+
  ---
 
- ### `classify(imagePath)`
+ ## API reference — types
+
+ ### `ocr(imagePath, options?)`
 
- Returns top image classification labels with confidence scores.
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) or PDF |
+ | `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
+
+ Returns `Promise<string>` or `Promise<VisionBlock[]>`.
 
  ```ts
- interface Classification {
-   identifier: string // e.g. 'document', 'outdoor', 'animal'
-   confidence: number // 0–1
+ interface VisionBlock {
+   text: string
+   x: number          // 0–1 from left
+   y: number          // 0–1 from top
+   width: number      // 0–1
+   height: number     // 0–1
+   confidence: number
+   page?: number      // 0-based, only for PDFs
  }
  ```
 
+ ### `detectFaces(imagePath)` / `detectBarcodes(imagePath)` / `detectRectangles(imagePath)` / `detectDocument(imagePath)` / `classify(imagePath)`
+
+ See `src/index.ts` for full type declarations.
+
  ---
 
  ## Why macos-vision?
 
  | | macos-vision | Tesseract.js | Cloud APIs |
  |---|---|---|---|
- | Offline | ✅ | ✅ | ❌ |
+ | Offline OCR | ✅ | ✅ | ❌ |
+ | Offline image → Markdown | ✅ (with local Ollama) | ❌ | ❌ |
  | No API key | ✅ | ✅ | ❌ |
  | Native speed | ✅ | ❌ | — |
  | Zero runtime deps | ✅ | ❌ | ❌ |
  | OCR with bounding boxes | ✅ | ✅ | ✅ |
- | Face detection | ✅ | ❌ | ✅ |
- | Barcode / QR | ✅ | ❌ | ✅ |
- | Document detection | ✅ | ❌ | ✅ |
+ | Face / barcode / document detection | ✅ | ❌ | ✅ |
  | Image classification | ✅ | ❌ | ✅ |
  | macOS only | ✅ | ❌ | ❌ |
 
  Apple Vision is the same engine used by macOS Spotlight, Live Text, and Shortcuts — highly optimized and accurate.
 
+ ### OCR evaluation notes
+
+ In internal tests on anonymized scanned contracts, forms, declarations, and UI screenshots, Apple Vision OCR produced fewer OCR artifacts than Tesseract in most cases. The strongest gains were on multi-column contract-style scans, where Apple Vision preserved substantially more usable text with far fewer artifacts. On simpler UI screenshots, both engines performed similarly.
+
+ These results are directional; they are not a public benchmark. The corpus is not included in this repository, and future benchmark fixtures should use synthetic or public-domain documents only.
+
  ## License
 
  MIT
package/bin/pdf-helper CHANGED
Binary file
package/bin/vision-helper CHANGED
Binary file
package/dist/cli.js CHANGED
@@ -1,91 +1,154 @@
  #!/usr/bin/env node
- import { resolve } from 'path';
+ import { resolve, dirname, basename, extname, join } from 'path';
+ import { writeFile } from 'fs/promises';
  import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify, } from './index.js';
  const USAGE = `
- Usage: vision-cli [options] <image>
+ Usage: macos-vision [options] <image-or-pdf>
 
- Options:
-   --ocr          OCR — plain text (default)
-   --blocks       OCR — structured blocks with coordinates
-   --faces        Face detection
-   --barcodes     Barcode & QR code detection
-   --rectangles   Rectangle detection
-   --document     Document boundary detection
-   --classify     Image classification
-   --all          Run all of the above
+ Vision options:
+   --ocr                 OCR — plain text (default)
+   --blocks              OCR — structured blocks with coordinates
+   --faces               Face detection
+   --barcodes            Barcode & QR code detection
+   --rectangles          Rectangle detection
+   --document            Document boundary detection
+   --classify            Image classification
+   --all                 Run all of the above
 
-   --help         Show this help
+ Markdown options (requires Ollama running locally):
+   --markdown            Convert image/PDF to Markdown via VisionScribe + Ollama
+   --model <name>        Ollama model name (default: mistral-nemo)
+   --ollama-url <url>    Ollama base URL (default: http://localhost:11434)
+   -o, --output <path>   Write Markdown to specified file
+   --stdout              Print Markdown to stdout instead of a file
+
+   --help                Show this help
 
  Examples:
-   vision-cli photo.jpg
-   vision-cli --blocks --faces photo.jpg
-   vision-cli --all photo.jpg
+   macos-vision photo.jpg
+   macos-vision --blocks --faces photo.jpg
+   macos-vision --all photo.jpg
+   macos-vision --markdown invoice.pdf -o notes.md
+   macos-vision --markdown receipt.jpg --stdout
  `.trim();
  const rawArgs = process.argv.slice(2);
  if (rawArgs.includes('--help') || rawArgs.length === 0) {
      console.log(USAGE);
      process.exit(0);
  }
- const flags = new Set(rawArgs.filter((a) => a.startsWith('--')));
- const fileArgs = rawArgs.filter((a) => !a.startsWith('--'));
+ // Strip value-bearing options first so the remaining tokens are either
+ // boolean flags (`--something`) or positional file paths.
+ function takeOpt(name, args) {
+     const i = args.indexOf(name);
+     if (i === -1)
+         return undefined;
+     const v = args[i + 1];
+     args.splice(i, 2);
+     return v;
+ }
+ const argv = [...rawArgs];
+ const model = takeOpt('--model', argv);
+ const ollamaUrl = takeOpt('--ollama-url', argv);
+ const outPath = takeOpt('-o', argv) ?? takeOpt('--output', argv);
+ const flags = new Set(argv.filter((a) => a.startsWith('--')));
+ const fileArgs = argv.filter((a) => !a.startsWith('-'));
  if (!fileArgs[0]) {
-     console.error('Error: no image path provided.\n');
+     console.error('Error: no image or PDF path provided.\n');
      console.log(USAGE);
      process.exit(1);
  }
- const imagePath = resolve(fileArgs[0]);
- const runAll = flags.has('--all');
- const runOcr = runAll || flags.has('--ocr');
- const runBlocks = runAll || flags.has('--blocks');
- const runFaces = runAll || flags.has('--faces');
- const runBarcodes = runAll || flags.has('--barcodes');
- const runRects = runAll || flags.has('--rectangles');
- const runDoc = runAll || flags.has('--document');
- const runClassify = runAll || flags.has('--classify');
- // Default: OCR text when no feature flag is given
- const anyFeatureFlag = runAll ||
-     flags.has('--ocr') ||
-     flags.has('--blocks') ||
-     flags.has('--faces') ||
-     flags.has('--barcodes') ||
-     flags.has('--rectangles') ||
-     flags.has('--document') ||
-     flags.has('--classify');
- const useDefault = !anyFeatureFlag;
- async function main() {
-     try {
-         if (useDefault || runOcr) {
-             const text = await ocr(imagePath);
-             console.log(text);
-         }
-         if (runBlocks) {
-             const blocks = (await ocr(imagePath, { format: 'blocks' }));
-             console.log(JSON.stringify(blocks, null, 2));
-         }
-         if (runFaces) {
-             const faces = (await detectFaces(imagePath));
-             console.log(JSON.stringify(faces, null, 2));
+ const inputPath = resolve(fileArgs[0]);
+ // ─── Markdown pipeline ─────────────────────────────────────────────────────────────
+ if (flags.has('--markdown')) {
+     const toStdout = flags.has('--stdout');
+     const opts = {};
+     if (model)
+         opts.model = model;
+     if (ollamaUrl)
+         opts.ollamaUrl = ollamaUrl;
+     (async () => {
+         const { VisionScribe, OllamaUnavailableError } = await import('./markdown/index.js');
+         const scribe = new VisionScribe(opts);
+         if (!toStdout)
+             process.stderr.write(`Converting ${fileArgs[0]}…\n`);
+         let markdown;
+         try {
+             markdown = await scribe.toMarkdown(inputPath);
          }
-         if (runBarcodes) {
-             const barcodes = (await detectBarcodes(imagePath));
-             console.log(JSON.stringify(barcodes, null, 2));
+         catch (err) {
+             if (err instanceof OllamaUnavailableError) {
+                 console.error(err.message);
+                 process.exit(2);
+             }
+             throw err;
          }
-         if (runRects) {
-             const rectangles = (await detectRectangles(imagePath));
-             console.log(JSON.stringify(rectangles, null, 2));
+         if (toStdout) {
+             process.stdout.write(markdown);
+             return;
          }
-         if (runDoc) {
-             const doc = (await detectDocument(imagePath));
-             console.log(JSON.stringify(doc, null, 2));
+         const finalPath = outPath ??
+             join(dirname(inputPath), basename(inputPath, extname(inputPath)) + '.md');
+         await writeFile(finalPath, markdown, 'utf8');
+         process.stderr.write(`Saved: ${finalPath}\n`);
+     })().catch((err) => {
+         console.error(err instanceof Error ? err.message : String(err));
+         process.exit(1);
+     });
+ }
+ else {
+     // ─── Vision pipeline (OCR / detections / classification) ───────────────────────
+     const runAll = flags.has('--all');
+     const runOcr = runAll || flags.has('--ocr');
+     const runBlocks = runAll || flags.has('--blocks');
+     const runFaces = runAll || flags.has('--faces');
+     const runBarcodes = runAll || flags.has('--barcodes');
+     const runRects = runAll || flags.has('--rectangles');
+     const runDoc = runAll || flags.has('--document');
+     const runClassify = runAll || flags.has('--classify');
+     // Default: OCR text when no feature flag is given
+     const anyFeatureFlag = runAll ||
+         flags.has('--ocr') ||
+         flags.has('--blocks') ||
+         flags.has('--faces') ||
+         flags.has('--barcodes') ||
+         flags.has('--rectangles') ||
+         flags.has('--document') ||
+         flags.has('--classify');
+     const useDefault = !anyFeatureFlag;
+     (async () => {
+         try {
+             if (useDefault || runOcr) {
+                 const text = await ocr(inputPath);
+                 console.log(text);
+             }
+             if (runBlocks) {
+                 const blocks = (await ocr(inputPath, { format: 'blocks' }));
+                 console.log(JSON.stringify(blocks, null, 2));
+             }
+             if (runFaces) {
+                 const faces = (await detectFaces(inputPath));
+                 console.log(JSON.stringify(faces, null, 2));
+             }
+             if (runBarcodes) {
+                 const barcodes = (await detectBarcodes(inputPath));
+                 console.log(JSON.stringify(barcodes, null, 2));
+             }
+             if (runRects) {
+                 const rectangles = (await detectRectangles(inputPath));
+                 console.log(JSON.stringify(rectangles, null, 2));
+             }
+             if (runDoc) {
+                 const doc = (await detectDocument(inputPath));
+                 console.log(JSON.stringify(doc, null, 2));
+             }
+             if (runClassify) {
+                 const labels = (await classify(inputPath));
+                 console.log(JSON.stringify(labels, null, 2));
+             }
          }
-         if (runClassify) {
-             const labels = (await classify(imagePath));
-             console.log(JSON.stringify(labels, null, 2));
+         catch (error) {
+             console.error('Error:', error);
+             process.exit(1);
          }
-     }
-     catch (error) {
-         console.error('Error:', error);
-         process.exit(1);
-     }
+     })();
  }
- main();
package/dist/index.d.ts CHANGED
@@ -112,3 +112,5 @@ export interface Classification {
  export declare function classify(imagePath: string): Promise<Classification[]>;
  export type { BlockKind, BaseBlock, TextBlock, FaceBlock, BarcodeBlock, RectangleBlock, DocumentBlock, LayoutBlock, InferLayoutInput, } from './layout.js';
  export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
+ export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
+ export type { VisionScribeOptions, ParagraphGroup } from './markdown/index.js';
package/dist/index.js CHANGED
@@ -15,7 +15,7 @@ async function run(flag, imagePath) {
      });
      return stdout;
  }
- // ─── PDF helpers ─────────────────────────────────────────────────────────────
+ // ─── PDF helpers ─────────────────────────────────────────────────────
  /**
   * Returns true if the file at `filePath` is a PDF.
   * Uses extension as a fast path; falls back to magic bytes (`%PDF`) for
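The doc comment above describes the check's shape; a self-contained sketch of that extension-plus-magic-bytes approach (an illustration, not the package's actual implementation) might look like:

```ts
import { open } from 'fs/promises';

// Sketch of the documented behaviour: extension fast path, then `%PDF` magic bytes.
async function isPdfSketch(filePath: string): Promise<boolean> {
  if (filePath.toLowerCase().endsWith('.pdf')) return true; // fast path
  const fh = await open(filePath, 'r');
  try {
    const buf = Buffer.alloc(4);
    await fh.read(buf, 0, 4, 0); // read the first four bytes of the file
    return buf.toString('ascii') === '%PDF';
  } finally {
    await fh.close();
  }
}
```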
@@ -75,11 +75,11 @@ async function ocrPdf(pdfPath, format) {
  export async function ocr(imagePath, options = {}) {
      const absPath = resolve(imagePath);
      const { format = 'text' } = options;
-     // ── PDF fast-path: rasterize via sips, then OCR each page ────────────────
+     // ── PDF fast-path: rasterize via sips, then OCR each page ────────────
      if (await isPdf(absPath)) {
          return ocrPdf(absPath, format);
      }
-     // ── Existing image path (unchanged) ──────────────────────────────────────
+     // ── Existing image path (unchanged) ─────────────────────────────────
      if (format === 'blocks') {
          const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
              timeout: BINARY_TIMEOUT_MS,
@@ -128,3 +128,5 @@ export async function classify(imagePath) {
      return raw;
  }
  export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
+ // ─── Markdown pipeline (VisionScribe) ──────────────────────────────────────────
+ export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
package/dist/markdown/chunker.d.ts ADDED
@@ -0,0 +1,11 @@
+ import type { ParagraphGroup } from './prompt.js';
+ export declare function estimateTokens(text: string): number;
+ /**
+  * Split an array of paragraphs into chunks where each chunk's estimated prompt
+  * token count stays within `chunkSizeTokens`. Paragraph boundaries are never
+  * split — chunks always break between `ParagraphGroup` objects.
+  *
+  * A paragraph whose estimated token count exceeds the budget on its own is
+  * emitted as a singleton chunk with a warning.
+  */
+ export declare function chunkParagraphs(paragraphs: ParagraphGroup[], chunkSizeTokens: number): ParagraphGroup[][];
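A minimal reading of these declarations, for orientation only: the shipped heuristics are internal, so the roughly-4-characters-per-token estimate and the `text` field on the paragraph type below are assumptions, not the real implementation.

```ts
// Sketch matching the declared signatures above, not the shipped code.
type ParagraphGroupSketch = { text: string }; // assumed shape

// Assumption: ~4 characters per token, a common rough heuristic.
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

// Greedy chunking: never split a paragraph; an oversized paragraph ends up
// as a singleton chunk (with a warning), as the doc comment describes.
function chunkParagraphs(
  paragraphs: ParagraphGroupSketch[],
  chunkSizeTokens: number,
): ParagraphGroupSketch[][] {
  const chunks: ParagraphGroupSketch[][] = [];
  let current: ParagraphGroupSketch[] = [];
  let used = 0;
  for (const p of paragraphs) {
    const cost = estimateTokens(p.text);
    if (cost > chunkSizeTokens) {
      console.warn('Paragraph exceeds chunk budget; emitting as a singleton chunk');
    }
    // Close the current chunk before it would overflow the budget.
    if (current.length > 0 && used + cost > chunkSizeTokens) {
      chunks.push(current);
      current = [];
      used = 0;
    }
    current.push(p);
    used += cost;
  }
  if (current.length > 0) chunks.push(current);
  return chunks;
}
```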