macos-vision 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # macos-vision
2
2
 
3
- > Apple Vision for Node.js — native, fast, offline, no API keys required.
3
+ > Apple Vision for Node.js — native, fast, offline. Now with an optional Ollama-driven Markdown pipeline.
4
4
 
5
5
  Uses macOS's built-in [Vision framework](https://developer.apple.com/documentation/vision) via a compiled Swift binary. Works completely offline. No cloud services, no API keys, no Python, zero runtime dependencies.
6
6
 
@@ -8,11 +8,8 @@ Uses macOS's built-in [Vision framework](https://developer.apple.com/documentati
8
8
 
9
9
  - macOS 12+
10
10
  - Node.js 18+
11
- - Xcode Command Line Tools
12
-
13
- ```bash
14
- xcode-select --install
15
- ```
11
+ - Xcode Command Line Tools (`xcode-select --install`)
12
+ - [Ollama](https://ollama.com) running locally — only if you use the Markdown pipeline
16
13
 
17
14
  ## Installation
18
15
 
@@ -20,18 +17,18 @@ xcode-select --install
20
17
  npm install macos-vision
21
18
  ```
22
19
 
23
- The native Swift binary is compiled automatically on install.
24
-
25
- ## What this is (and isn't)
26
-
27
- `macos-vision` gives you **raw Apple Vision results** — text, coordinates, bounding boxes, labels.
20
+ The native Swift binaries (`vision-helper`, `pdf-helper`) are compiled automatically on install.
28
21
 
29
- It is **not** a document pipeline. It does not:
30
- - Convert PDFs or images to Markdown
31
- - Understand document structure (headings, tables, paragraphs)
32
- - Chain multiple detections into a final report
22
+ ## What you get
33
23
 
34
- For those use cases, use the raw output as input to an LLM or a post-processing layer of your own.
24
+ | Capability | Engine | Network |
25
+ |---|---|---|
26
+ | OCR (text + bounding boxes) | Apple Vision | offline |
27
+ | Face / barcode / rectangle / document detection | Apple Vision | offline |
28
+ | Image classification | Apple Vision | offline |
29
+ | Layout inference (lines, paragraphs, reading order) | heuristic in TypeScript | offline |
30
+ | PDF rasterization | PDFKit (`pdf-helper`) | offline |
31
+ | **Image / PDF → Markdown** | Apple Vision OCR + local LLM via Ollama | local LLM call |
35
32
 
36
33
  ---
37
34
 
@@ -44,60 +41,71 @@ npx macos-vision photo.jpg
44
41
  # Structured OCR blocks with bounding boxes
45
42
  npx macos-vision --blocks photo.jpg
46
43
 
47
- # Detect faces
44
+ # Detections
48
45
  npx macos-vision --faces photo.jpg
49
-
50
- # Detect barcodes and QR codes
51
46
  npx macos-vision --barcodes photo.jpg
52
-
53
- # Detect rectangular shapes
54
47
  npx macos-vision --rectangles photo.jpg
55
-
56
- # Find document boundary
57
48
  npx macos-vision --document photo.jpg
58
-
59
- # Classify image content
60
49
  npx macos-vision --classify photo.jpg
61
50
 
62
51
  # Run all detections at once
63
52
  npx macos-vision --all photo.jpg
53
+
54
+ # Image / PDF → Markdown via VisionScribe + Ollama
55
+ npx macos-vision --markdown invoice.pdf -o notes.md
56
+ npx macos-vision --markdown receipt.jpg --stdout
57
+ npx macos-vision --markdown scan.png --model llama3.2
64
58
  ```
65
59
 
66
- Multiple flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg`
60
+ Multiple Vision flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg`. Structured results are printed as JSON to stdout.
61
+
62
+ ### CLI flags
67
63
 
68
- Structured results are printed as JSON to stdout.
64
+ | Flag | Description |
65
+ |---|---|
66
+ | `--ocr` | Plain text OCR (default when no flag is given) |
67
+ | `--blocks` | OCR with bounding boxes (JSON) |
68
+ | `--faces` / `--barcodes` / `--rectangles` / `--document` / `--classify` | Vision detections (JSON) |
69
+ | `--all` | Run every Vision detection at once |
70
+ | `--markdown` | Convert image / PDF to Markdown via VisionScribe + Ollama |
71
+ | `--model <name>` | Ollama model (default: `mistral-nemo`). Only used with `--markdown` |
72
+ | `--ollama-url <url>` | Ollama base URL (default: `http://localhost:11434`). Only used with `--markdown` |
73
+ | `-o`, `--output <path>` | Write Markdown to a file. Only used with `--markdown` |
74
+ | `--stdout` | Print Markdown to stdout instead of a file. Only used with `--markdown` |
75
+ | `--help` | Show usage |
69
76
 
70
77
  ---
71
78
 
72
- ## API
79
+ ## API — Vision
73
80
 
74
81
  ```js
75
- import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify } from 'macos-vision'
82
+ import {
83
+ ocr,
84
+ detectFaces,
85
+ detectBarcodes,
86
+ detectRectangles,
87
+ detectDocument,
88
+ classify,
89
+ inferLayout,
90
+ } from 'macos-vision';
76
91
 
77
92
  // OCR — plain text
78
- const text = await ocr('photo.jpg')
93
+ const text = await ocr('photo.jpg');
79
94
 
80
95
  // OCR — structured blocks with bounding boxes
81
- const blocks = await ocr('photo.jpg', { format: 'blocks' })
82
-
83
- // Detect faces
84
- const faces = await detectFaces('photo.jpg')
85
-
86
- // Detect barcodes and QR codes
87
- const codes = await detectBarcodes('invoice.jpg')
88
-
89
- // Detect rectangular shapes (tables, forms, cards)
90
- const rects = await detectRectangles('document.jpg')
96
+ const blocks = await ocr('photo.jpg', { format: 'blocks' });
91
97
 
92
- // Find document boundary in a photo
93
- const doc = await detectDocument('photo.jpg') // DocumentBounds | null
98
+ // Detect faces / barcodes / rectangles / document boundary
99
+ const faces = await detectFaces('photo.jpg');
100
+ const codes = await detectBarcodes('invoice.jpg');
101
+ const rects = await detectRectangles('document.jpg');
102
+ const doc = await detectDocument('photo.jpg'); // DocumentBounds | null
94
103
 
95
104
  // Classify image content
96
- const labels = await classify('photo.jpg')
105
+ const labels = await classify('photo.jpg');
97
106
 
98
107
  // Layout inference — unified reading-order-sorted representation
99
- const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes })
100
- // layout is LayoutBlock[] — ready to feed into a Markdown renderer or LLM context
108
+ const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes });
101
109
  ```
102
110
 
103
111
  ### Layout inference
@@ -134,111 +142,161 @@ for (const block of layout) {
134
142
 
135
143
  > **Note:** Layout inference is a heuristic layer. It does not understand multi-column layouts or rotated text. Treat it as structured input for downstream tools, not as ground truth.
136
144
 
137
- ## API
145
+ ---
138
146
 
139
- ### `ocr(imagePath, options?)`
147
+ ## API — Markdown pipeline (VisionScribe)
140
148
 
141
- Extracts text from an image.
149
+ `VisionScribe` converts an image or PDF to Markdown by combining Apple Vision OCR with a local LLM (via Ollama). The LLM never sees the image — it only formats text that Vision already extracted, which keeps the source text grounded in deterministic OCR output and greatly reduces the hallucinations typical of cloud vision APIs (smaller models may still occasionally paraphrase; see "Known limitations").
142
150
 
143
- | Parameter | Type | Default | Description |
144
- |-----------|------|---------|-------------|
145
- | `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) |
146
- | `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
147
-
148
- Returns `Promise<string>` or `Promise<VisionBlock[]>`.
151
+ ### Prerequisites
149
152
 
150
- ```ts
151
- interface VisionBlock {
152
- text: string
153
- x: number // 0–1 from left
154
- y: number // 0–1 from top
155
- width: number // 0–1
156
- height: number // 0–1
157
- }
153
+ ```bash
154
+ brew install ollama
155
+ ollama serve # keep this running
156
+ ollama pull mistral-nemo
158
157
  ```
159
158
 
160
- ---
159
+ ### Quick start
161
160
 
162
- ### `detectFaces(imagePath)`
161
+ ```ts
162
+ import { VisionScribe } from 'macos-vision';
163
+
164
+ const scribe = new VisionScribe();
165
+ const markdown = await scribe.toMarkdown('receipt.png');
166
+ console.log(markdown);
167
+ ```
163
168
 
164
- Detects human faces and returns their bounding boxes.
169
+ For a narrower import surface that pulls in only the markdown sub-module:
165
170
 
166
171
  ```ts
167
- interface Face {
168
- x: number; y: number; width: number; height: number
169
- confidence: number // 0–1
170
- }
172
+ import { VisionScribe } from 'macos-vision/markdown';
171
173
  ```
172
174
 
173
- ---
175
+ ### How it works
174
176
 
175
- ### `detectBarcodes(imagePath)`
177
+ ```
178
+ Image / PDF
179
+
180
+
181
+ Apple Vision OCR ← macOS native, deterministic, zero hallucination
182
+ │ VisionBlock[] per page
183
+
184
+ Per-page layout inference ← each page processed independently (page-local coords)
185
+ │ paragraphId, lineId, y
186
+
187
+ Chunker ← batches paragraphs to fit the LLM output window
188
+ │ ParagraphGroup[][]
189
+
190
+ Ollama /api/chat ← system prompt as role:"system", OCR text as role:"user"
191
+ │ temperature=0, top_p=1, num_predict=-1
192
+
193
+ Markdown string ← chunk results joined with blank lines
194
+ ```
176
195
 
177
- Detects barcodes and QR codes and decodes their payload.
196
+ The LLM never sees the raw image; it only formats text that Apple Vision has already extracted. The system prompt instructs the model to act as a high-fidelity document parser and explicitly forbids summarising, paraphrasing, or adding content. OCR text is wrapped in `<ocr_source>` tags so the model cannot mistake it for a user asking a question. Per-page processing keeps paragraph coordinates from different pages from being mixed.
178
197
 
179
- ```ts
180
- interface Barcode {
181
- type: string // e.g. 'org.iso.QRCode', 'org.gs1.EAN-13'
182
- value: string // decoded content
183
- x: number; y: number; width: number; height: number
184
- }
185
- ```
198
+ ### `new VisionScribe(options?)`
186
199
 
187
- ---
200
+ | Option | Type | Default | Description |
201
+ |---|---|---|---|
202
+ | `model` | `string` | `'mistral-nemo'` | Ollama model name |
203
+ | `ollamaUrl` | `string` | `'http://localhost:11434'` | Base URL of the Ollama server |
204
+ | `skipPing` | `boolean` | `false` | Skip per-call Ollama health check (useful in batch loops) |
205
+ | `chunkSizeTokens` | `number` | `1800` | Max estimated output tokens per LLM chunk. Lower = more chunks (safer for small models); higher = fewer calls but risks hitting model output limits |
206
+
207
+ ### `scribe.toMarkdown(imagePath)`
188
208
 
189
- ### `detectRectangles(imagePath)`
209
+ - Accepts PNG, JPEG, HEIC, HEIF, TIFF, GIF, BMP, WebP and **PDF**
210
+ - Returns an empty string `''` if no text is detected
211
+ - Throws `OllamaUnavailableError` if the Ollama server is not reachable (unless `skipPing: true`)
190
212
 
191
- Finds rectangular shapes (documents, tables, cards, forms).
213
+ ### Batch processing
192
214
 
193
215
  ```ts
194
- interface Rectangle {
195
- topLeft: [number, number]; topRight: [number, number]
196
- bottomLeft: [number, number]; bottomRight: [number, number]
197
- confidence: number
216
+ import { VisionScribe, OllamaUnavailableError } from 'macos-vision';
217
+
218
+ const scribe = new VisionScribe({ skipPing: true });
219
+
220
+ for (const file of files) {
221
+ try {
222
+ const md = await scribe.toMarkdown(file);
223
+ // …
224
+ } catch (e) {
225
+ if (e instanceof OllamaUnavailableError) {
226
+ console.error(e.message);
227
+ break;
228
+ }
229
+ throw e;
230
+ }
198
231
  }
199
232
  ```
200
233
 
234
+ ### Known limitations
235
+
236
+ - **Local model fidelity**: small models (`mistral-nemo`, `gemma`) may occasionally summarise or paraphrase long, dense documents. Larger models (`llama3.1:70b`, `qwen2.5:32b`) produce significantly better fidelity.
237
+ - **Tables**: multi-column table layouts are partially supported. OCR reads cells in reading order but the LLM may not always reconstruct correct Markdown table syntax.
238
+ - **Images / charts**: non-textual content (photos, diagrams, charts) is ignored — only text blocks extracted by Apple Vision are processed.
239
+
201
240
  ---
202
241
 
203
- ### `detectDocument(imagePath)`
242
+ ## Migrating from `macos-vision-md`
204
243
 
205
- Finds the boundary of a document in a photo (e.g. paper on a desk). Returns `null` if no document is found.
244
+ The standalone [`macos-vision-md`](https://github.com/woladi/macos-vision-md) package has been merged into `macos-vision` as of v2.0.0. The old package will keep working as a thin re-export shim, but new projects should depend on `macos-vision` directly.
206
245
 
207
- ```ts
208
- interface DocumentBounds {
209
- topLeft: [number, number]; topRight: [number, number]
210
- bottomLeft: [number, number]; bottomRight: [number, number]
211
- confidence: number
212
- }
246
+ ```diff
247
+ - import { VisionScribe } from 'macos-vision-md';
248
+ + import { VisionScribe } from 'macos-vision';
249
+ ```
250
+
251
+ ```diff
252
+ - macos-vision-md invoice.pdf -o notes.md
253
+ + macos-vision --markdown invoice.pdf -o notes.md
213
254
  ```
214
255
 
256
+ The `VisionScribe` API, the system prompt, and the chunking strategy are unchanged. `OllamaUnavailableError`, `VisionScribeOptions`, and `ParagraphGroup` are now exported from `macos-vision`.
257
+
215
258
  ---
216
259
 
217
- ### `classify(imagePath)`
260
+ ## API reference — types
218
261
 
219
- Returns top image classification labels with confidence scores.
262
+ ### `ocr(imagePath, options?)`
263
+
264
+ | Parameter | Type | Default | Description |
265
+ |-----------|------|---------|-------------|
266
+ | `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) or PDF |
267
+ | `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
268
+
269
+ Returns `Promise<string>` or `Promise<VisionBlock[]>`.
220
270
 
221
271
  ```ts
222
- interface Classification {
223
- identifier: string // e.g. 'document', 'outdoor', 'animal'
224
- confidence: number // 0–1
272
+ interface VisionBlock {
273
+ text: string
274
+ x: number // 0–1 from left
275
+ y: number // 0–1 from top
276
+ width: number // 0–1
277
+ height: number // 0–1
278
+ confidence: number
279
+ page?: number // 0-based, only for PDFs
225
280
  }
226
281
  ```
227
282
 
283
+ ### `detectFaces(imagePath)` / `detectBarcodes(imagePath)` / `detectRectangles(imagePath)` / `detectDocument(imagePath)` / `classify(imagePath)`
284
+
285
+ See `src/index.ts` for full type declarations.
286
+
228
287
  ---
229
288
 
230
289
  ## Why macos-vision?
231
290
 
232
291
  | | macos-vision | Tesseract.js | Cloud APIs |
233
292
  |---|---|---|---|
234
- | Offline | ✅ | ✅ | ❌ |
293
+ | Offline OCR | ✅ | ✅ | ❌ |
294
+ | Offline image → Markdown | ✅ (with local Ollama) | ❌ | ❌ |
235
295
  | No API key | ✅ | ✅ | ❌ |
236
296
  | Native speed | ✅ | ❌ | — |
237
297
  | Zero runtime deps | ✅ | ❌ | ❌ |
238
298
  | OCR with bounding boxes | ✅ | ✅ | ✅ |
239
- | Face detection | ✅ | ❌ | ✅ |
240
- | Barcode / QR | ✅ | ❌ | ✅ |
241
- | Document detection | ✅ | ❌ | ✅ |
299
+ | Face / barcode / document detection | ✅ | ❌ | ✅ |
242
300
  | Image classification | ✅ | ❌ | ✅ |
243
301
  | macOS only | ✅ | ❌ | ❌ |
244
302
 
package/bin/pdf-helper ADDED
Binary file
package/bin/vision-helper CHANGED
Binary file
package/dist/cli.js CHANGED
@@ -1,91 +1,154 @@
1
1
  #!/usr/bin/env node
2
- import { resolve } from 'path';
2
+ import { resolve, dirname, basename, extname, join } from 'path';
3
+ import { writeFile } from 'fs/promises';
3
4
  import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify, } from './index.js';
4
5
  const USAGE = `
5
- Usage: vision-cli [options] <image>
6
+ Usage: macos-vision [options] <image-or-pdf>
6
7
 
7
- Options:
8
- --ocr OCR — plain text (default)
9
- --blocks OCR — structured blocks with coordinates
10
- --faces Face detection
11
- --barcodes Barcode & QR code detection
12
- --rectangles Rectangle detection
13
- --document Document boundary detection
14
- --classify Image classification
15
- --all Run all of the above
8
+ Vision options:
9
+ --ocr OCR — plain text (default)
10
+ --blocks OCR — structured blocks with coordinates
11
+ --faces Face detection
12
+ --barcodes Barcode & QR code detection
13
+ --rectangles Rectangle detection
14
+ --document Document boundary detection
15
+ --classify Image classification
16
+ --all Run all of the above
16
17
 
17
- --help Show this help
18
+ Markdown options (requires Ollama running locally):
19
+ --markdown Convert image/PDF to Markdown via VisionScribe + Ollama
20
+ --model <name> Ollama model name (default: mistral-nemo)
21
+ --ollama-url <url> Ollama base URL (default: http://localhost:11434)
22
+ -o, --output <path> Write Markdown to specified file
23
+ --stdout Print Markdown to stdout instead of a file
24
+
25
+ --help Show this help
18
26
 
19
27
  Examples:
20
- vision-cli photo.jpg
21
- vision-cli --blocks --faces photo.jpg
22
- vision-cli --all photo.jpg
28
+ macos-vision photo.jpg
29
+ macos-vision --blocks --faces photo.jpg
30
+ macos-vision --all photo.jpg
31
+ macos-vision --markdown invoice.pdf -o notes.md
32
+ macos-vision --markdown receipt.jpg --stdout
23
33
  `.trim();
24
34
  const rawArgs = process.argv.slice(2);
25
35
  if (rawArgs.includes('--help') || rawArgs.length === 0) {
26
36
  console.log(USAGE);
27
37
  process.exit(0);
28
38
  }
29
- const flags = new Set(rawArgs.filter((a) => a.startsWith('--')));
30
- const fileArgs = rawArgs.filter((a) => !a.startsWith('--'));
39
+ // Strip value-bearing options first so the remaining tokens are either
40
+ // boolean flags (`--something`) or positional file paths.
41
+ function takeOpt(name, args) {
42
+ const i = args.indexOf(name);
43
+ if (i === -1)
44
+ return undefined;
45
+ const v = args[i + 1];
46
+ args.splice(i, 2);
47
+ return v;
48
+ }
49
+ const argv = [...rawArgs];
50
+ const model = takeOpt('--model', argv);
51
+ const ollamaUrl = takeOpt('--ollama-url', argv);
52
+ const outPath = takeOpt('-o', argv) ?? takeOpt('--output', argv);
53
+ const flags = new Set(argv.filter((a) => a.startsWith('--')));
54
+ const fileArgs = argv.filter((a) => !a.startsWith('-'));
31
55
  if (!fileArgs[0]) {
32
- console.error('Error: no image path provided.\n');
56
+ console.error('Error: no image or PDF path provided.\n');
33
57
  console.log(USAGE);
34
58
  process.exit(1);
35
59
  }
36
- const imagePath = resolve(fileArgs[0]);
37
- const runAll = flags.has('--all');
38
- const runOcr = runAll || flags.has('--ocr');
39
- const runBlocks = runAll || flags.has('--blocks');
40
- const runFaces = runAll || flags.has('--faces');
41
- const runBarcodes = runAll || flags.has('--barcodes');
42
- const runRects = runAll || flags.has('--rectangles');
43
- const runDoc = runAll || flags.has('--document');
44
- const runClassify = runAll || flags.has('--classify');
45
- // Default: OCR text when no feature flag is given
46
- const anyFeatureFlag = runAll ||
47
- flags.has('--ocr') ||
48
- flags.has('--blocks') ||
49
- flags.has('--faces') ||
50
- flags.has('--barcodes') ||
51
- flags.has('--rectangles') ||
52
- flags.has('--document') ||
53
- flags.has('--classify');
54
- const useDefault = !anyFeatureFlag;
55
- async function main() {
56
- try {
57
- if (useDefault || runOcr) {
58
- const text = await ocr(imagePath);
59
- console.log(text);
60
- }
61
- if (runBlocks) {
62
- const blocks = (await ocr(imagePath, { format: 'blocks' }));
63
- console.log(JSON.stringify(blocks, null, 2));
64
- }
65
- if (runFaces) {
66
- const faces = (await detectFaces(imagePath));
67
- console.log(JSON.stringify(faces, null, 2));
60
+ const inputPath = resolve(fileArgs[0]);
61
+ // ─── Markdown pipeline ─────────────────────────────────────────────────────────────
62
+ if (flags.has('--markdown')) {
63
+ const toStdout = flags.has('--stdout');
64
+ const opts = {};
65
+ if (model)
66
+ opts.model = model;
67
+ if (ollamaUrl)
68
+ opts.ollamaUrl = ollamaUrl;
69
+ (async () => {
70
+ const { VisionScribe, OllamaUnavailableError } = await import('./markdown/index.js');
71
+ const scribe = new VisionScribe(opts);
72
+ if (!toStdout)
73
+ process.stderr.write(`Converting ${fileArgs[0]}…\n`);
74
+ let markdown;
75
+ try {
76
+ markdown = await scribe.toMarkdown(inputPath);
68
77
  }
69
- if (runBarcodes) {
70
- const barcodes = (await detectBarcodes(imagePath));
71
- console.log(JSON.stringify(barcodes, null, 2));
78
+ catch (err) {
79
+ if (err instanceof OllamaUnavailableError) {
80
+ console.error(err.message);
81
+ process.exit(2);
82
+ }
83
+ throw err;
72
84
  }
73
- if (runRects) {
74
- const rectangles = (await detectRectangles(imagePath));
75
- console.log(JSON.stringify(rectangles, null, 2));
85
+ if (toStdout) {
86
+ process.stdout.write(markdown);
87
+ return;
76
88
  }
77
- if (runDoc) {
78
- const doc = (await detectDocument(imagePath));
79
- console.log(JSON.stringify(doc, null, 2));
89
+ const finalPath = outPath ??
90
+ join(dirname(inputPath), basename(inputPath, extname(inputPath)) + '.md');
91
+ await writeFile(finalPath, markdown, 'utf8');
92
+ process.stderr.write(`Saved: ${finalPath}\n`);
93
+ })().catch((err) => {
94
+ console.error(err instanceof Error ? err.message : String(err));
95
+ process.exit(1);
96
+ });
97
+ }
98
+ else {
99
+ // ─── Vision pipeline (OCR / detections / classification) ───────────────────────
100
+ const runAll = flags.has('--all');
101
+ const runOcr = runAll || flags.has('--ocr');
102
+ const runBlocks = runAll || flags.has('--blocks');
103
+ const runFaces = runAll || flags.has('--faces');
104
+ const runBarcodes = runAll || flags.has('--barcodes');
105
+ const runRects = runAll || flags.has('--rectangles');
106
+ const runDoc = runAll || flags.has('--document');
107
+ const runClassify = runAll || flags.has('--classify');
108
+ // Default: OCR text when no feature flag is given
109
+ const anyFeatureFlag = runAll ||
110
+ flags.has('--ocr') ||
111
+ flags.has('--blocks') ||
112
+ flags.has('--faces') ||
113
+ flags.has('--barcodes') ||
114
+ flags.has('--rectangles') ||
115
+ flags.has('--document') ||
116
+ flags.has('--classify');
117
+ const useDefault = !anyFeatureFlag;
118
+ (async () => {
119
+ try {
120
+ if (useDefault || runOcr) {
121
+ const text = await ocr(inputPath);
122
+ console.log(text);
123
+ }
124
+ if (runBlocks) {
125
+ const blocks = (await ocr(inputPath, { format: 'blocks' }));
126
+ console.log(JSON.stringify(blocks, null, 2));
127
+ }
128
+ if (runFaces) {
129
+ const faces = (await detectFaces(inputPath));
130
+ console.log(JSON.stringify(faces, null, 2));
131
+ }
132
+ if (runBarcodes) {
133
+ const barcodes = (await detectBarcodes(inputPath));
134
+ console.log(JSON.stringify(barcodes, null, 2));
135
+ }
136
+ if (runRects) {
137
+ const rectangles = (await detectRectangles(inputPath));
138
+ console.log(JSON.stringify(rectangles, null, 2));
139
+ }
140
+ if (runDoc) {
141
+ const doc = (await detectDocument(inputPath));
142
+ console.log(JSON.stringify(doc, null, 2));
143
+ }
144
+ if (runClassify) {
145
+ const labels = (await classify(inputPath));
146
+ console.log(JSON.stringify(labels, null, 2));
147
+ }
80
148
  }
81
- if (runClassify) {
82
- const labels = (await classify(imagePath));
83
- console.log(JSON.stringify(labels, null, 2));
149
+ catch (error) {
150
+ console.error('Error:', error);
151
+ process.exit(1);
84
152
  }
85
- }
86
- catch (error) {
87
- console.error('Error:', error);
88
- process.exit(1);
89
- }
153
+ })();
90
154
  }
91
- main();
package/dist/index.d.ts CHANGED
@@ -1,3 +1,24 @@
1
+ export interface PdfPage {
2
+ /** 0-based page index */
3
+ page: number;
4
+ /** Absolute path to the rasterized PNG file */
5
+ path: string;
6
+ }
7
+ export interface PdfRasterizeResult {
8
+ /** Pages in document order */
9
+ pages: PdfPage[];
10
+ /** Directory containing all rasterized PNGs */
11
+ cacheDir: string;
12
+ }
13
+ /**
14
+ * Rasterizes a PDF to 300 DPI PNG files using the native `pdf-helper` binary
15
+ * (PDFKit-based). Files are saved persistently to `~/.cache/macos-vision/`
16
+ * so they can be reused by downstream tools — **caller is responsible for cleanup**.
17
+ *
18
+ * @param pdfPath - Absolute or relative path to the PDF file.
19
+ * @returns An object with `pages` (sorted array of `{page, path}`) and `cacheDir`.
20
+ */
21
+ export declare function rasterizePdf(pdfPath: string): Promise<PdfRasterizeResult>;
1
22
  export interface VisionBlock {
2
23
  /** Recognized text */
3
24
  text: string;
@@ -91,3 +112,5 @@ export interface Classification {
91
112
  export declare function classify(imagePath: string): Promise<Classification[]>;
92
113
  export type { BlockKind, BaseBlock, TextBlock, FaceBlock, BarcodeBlock, RectangleBlock, DocumentBlock, LayoutBlock, InferLayoutInput, } from './layout.js';
93
114
  export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
115
+ export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
116
+ export type { VisionScribeOptions, ParagraphGroup } from './markdown/index.js';