macos-vision 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +168 -103
- package/bin/pdf-helper +0 -0
- package/bin/vision-helper +0 -0
- package/dist/cli.js +131 -68
- package/dist/index.d.ts +2 -0
- package/dist/index.js +5 -3
- package/dist/markdown/chunker.d.ts +11 -0
- package/dist/markdown/chunker.js +39 -0
- package/dist/markdown/index.d.ts +61 -0
- package/dist/markdown/index.js +92 -0
- package/dist/markdown/ollama.d.ts +21 -0
- package/dist/markdown/ollama.js +50 -0
- package/dist/markdown/prompt.d.ts +35 -0
- package/dist/markdown/prompt.js +82 -0
- package/package.json +30 -5
- package/src/native/pdf-helper.swift +122 -0
- package/src/native/vision-helper.swift +241 -0
- package/.husky/commit-msg +0 -2
- package/.husky/pre-commit +0 -3
- package/.prettierignore +0 -4
- package/.prettierrc.json +0 -7
- package/.release-it.json +0 -20
- package/CHANGELOG.md +0 -44
- package/commitlint.config.js +0 -1
- package/debug.js +0 -37
- package/eslint.config.js +0 -21
package/README.md CHANGED

@@ -1,6 +1,6 @@
 # macos-vision

-> Apple Vision for Node.js — native, fast, offline
+> Apple Vision for Node.js — native, fast, offline. Now with an optional Ollama-driven Markdown pipeline.

 Uses macOS's built-in [Vision framework](https://developer.apple.com/documentation/vision) via a compiled Swift binary. Works completely offline. No cloud services, no API keys, no Python, zero runtime dependencies.

@@ -8,11 +8,8 @@ Uses macOS's built-in [Vision framework](https://developer.apple.com/documentati

 - macOS 12+
 - Node.js 18+
-- Xcode Command Line Tools
-
-```bash
-xcode-select --install
-```
+- Xcode Command Line Tools (`xcode-select --install`)
+- [Ollama](https://ollama.com) running locally — only if you use the Markdown pipeline

 ## Installation

@@ -20,18 +17,18 @@ xcode-select --install
 npm install macos-vision
 ```

-The native Swift
-
-## What this is (and isn't)
-
-`macos-vision` gives you **raw Apple Vision results** — text, coordinates, bounding boxes, labels.
+The native Swift binaries (`vision-helper`, `pdf-helper`) are compiled automatically on install.

-
-- Convert PDFs or images to Markdown
-- Understand document structure (headings, tables, paragraphs)
-- Chain multiple detections into a final report
+## What you get

-
+| Capability | Engine | Network |
+|---|---|---|
+| OCR (text + bounding boxes) | Apple Vision | offline |
+| Face / barcode / rectangle / document detection | Apple Vision | offline |
+| Image classification | Apple Vision | offline |
+| Layout inference (lines, paragraphs, reading order) | heuristic in TypeScript | offline |
+| PDF rasterization | PDFKit (`pdf-helper`) | offline |
+| **Image / PDF → Markdown** | Apple Vision OCR + local LLM via Ollama | local LLM call |

 ---

@@ -44,60 +41,71 @@ npx macos-vision photo.jpg
 # Structured OCR blocks with bounding boxes
 npx macos-vision --blocks photo.jpg

-#
+# Detections
 npx macos-vision --faces photo.jpg
-
-# Detect barcodes and QR codes
 npx macos-vision --barcodes photo.jpg
-
-# Detect rectangular shapes
 npx macos-vision --rectangles photo.jpg
-
-# Find document boundary
 npx macos-vision --document photo.jpg
-
-# Classify image content
 npx macos-vision --classify photo.jpg

 # Run all detections at once
 npx macos-vision --all photo.jpg
+
+# Image / PDF → Markdown via VisionScribe + Ollama
+npx macos-vision --markdown invoice.pdf -o notes.md
+npx macos-vision --markdown receipt.jpg --stdout
+npx macos-vision --markdown scan.png --model llama3.2
 ```

-Multiple flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg
+Multiple Vision flags can be combined: `npx macos-vision --blocks --faces --classify photo.jpg`. Structured results are printed as JSON to stdout.

-
+### CLI flags
+
+| Flag | Description |
+|---|---|
+| `--ocr` | Plain text OCR (default when no flag is given) |
+| `--blocks` | OCR with bounding boxes (JSON) |
+| `--faces` / `--barcodes` / `--rectangles` / `--document` / `--classify` | Vision detections (JSON) |
+| `--all` | Run every Vision detection at once |
+| `--markdown` | Convert image / PDF to Markdown via VisionScribe + Ollama |
+| `--model <name>` | Ollama model (default: `mistral-nemo`). Only used with `--markdown` |
+| `--ollama-url <url>` | Ollama base URL (default: `http://localhost:11434`). Only used with `--markdown` |
+| `-o`, `--output <path>` | Write Markdown to a file. Only used with `--markdown` |
+| `--stdout` | Print Markdown to stdout instead of a file. Only used with `--markdown` |
+| `--help` | Show usage |

 ---

-## API
+## API — Vision

 ```js
-import {
+import {
+  ocr,
+  detectFaces,
+  detectBarcodes,
+  detectRectangles,
+  detectDocument,
+  classify,
+  inferLayout,
+} from 'macos-vision';

 // OCR — plain text
-const text = await ocr('photo.jpg')
+const text = await ocr('photo.jpg');

 // OCR — structured blocks with bounding boxes
-const blocks = await ocr('photo.jpg', { format: 'blocks' })
-
-// Detect faces
-const faces = await detectFaces('photo.jpg')
-
-// Detect barcodes and QR codes
-const codes = await detectBarcodes('invoice.jpg')
+const blocks = await ocr('photo.jpg', { format: 'blocks' });

-// Detect
-const
-
-
-const doc = await detectDocument('photo.jpg') // DocumentBounds | null
+// Detect faces / barcodes / rectangles / document boundary
+const faces = await detectFaces('photo.jpg');
+const codes = await detectBarcodes('invoice.jpg');
+const rects = await detectRectangles('document.jpg');
+const doc = await detectDocument('photo.jpg'); // DocumentBounds | null

 // Classify image content
-const labels = await classify('photo.jpg')
+const labels = await classify('photo.jpg');

 // Layout inference — unified reading-order-sorted representation
-const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes })
-// layout is LayoutBlock[] — ready to feed into a Markdown renderer or LLM context
+const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes });
 ```

 ### Layout inference

@@ -134,116 +142,173 @@ for (const block of layout) {

 > **Note:** Layout inference is a heuristic layer. It does not understand multi-column layouts or rotated text. Treat it as structured input for downstream tools, not as ground truth.

-
+---

-
+## API — Markdown pipeline (VisionScribe)

-
+`VisionScribe` converts an image or PDF to Markdown by combining Apple Vision OCR with a local LLM (via Ollama). The LLM never sees the image — it only formats text that Vision already extracted. This keeps image processing local and reduces the risk of vision-model hallucinations, but Markdown reconstruction is still best-effort and depends on the local model and document complexity.

-
-|-----------|------|---------|-------------|
-| `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) |
-| `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
+### Prerequisites

-
-
-
-
-  text: string
-  x: number      // 0–1 from left
-  y: number      // 0–1 from top
-  width: number  // 0–1
-  height: number // 0–1
-}
+```bash
+brew install ollama
+ollama serve          # keep this running
+ollama pull mistral-nemo
 ```

-
+### Quick start

-
+```ts
+import { VisionScribe } from 'macos-vision';

-
+const scribe = new VisionScribe();
+const markdown = await scribe.toMarkdown('receipt.png');
+console.log(markdown);
+```
+
+For a narrower import surface that pulls in only the markdown sub-module:

 ```ts
-
-  x: number; y: number; width: number; height: number
-  confidence: number // 0–1
-}
+import { VisionScribe } from 'macos-vision/markdown';
 ```

-
+### How it works

-
+```
+Image / PDF
+     │
+     ▼
+Apple Vision OCR             ← macOS native text extraction
+     │  VisionBlock[] per page
+     ▼
+Per-page layout inference    ← each page processed independently (page-local coords)
+     │  paragraphId, lineId, y
+     ▼
+Chunker                      ← batches paragraphs to fit the LLM output window
+     │  ParagraphGroup[][]
+     ▼
+Ollama /api/chat             ← system prompt as role:"system", OCR text as role:"user"
+     │  temperature=0, top_p=1, num_predict=-1
+     ▼
+Markdown string              ← chunk results joined with blank lines
+```

-
+The LLM never sees the raw image; it only formats text that Apple Vision has already extracted. The system prompt asks the model to preserve the source text, avoid summarising, and avoid adding content. OCR text is wrapped in `<ocr_source>` tags so the model is less likely to treat document text as user instructions. Per-page processing keeps paragraph coordinates from different pages from being mixed.

-
-interface Barcode {
-  type: string  // e.g. 'org.iso.QRCode', 'org.gs1.EAN-13'
-  value: string // decoded content
-  x: number; y: number; width: number; height: number
-}
-```
+### `new VisionScribe(options?)`

-
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `model` | `string` | `'mistral-nemo'` | Ollama model name |
+| `ollamaUrl` | `string` | `'http://localhost:11434'` | Base URL of the Ollama server |
+| `skipPing` | `boolean` | `false` | Skip per-call Ollama health check (useful in batch loops) |
+| `chunkSizeTokens` | `number` | `1800` | Max estimated output tokens per LLM chunk. Lower = more chunks (safer for small models); higher = fewer calls but risks hitting model output limits |
+
+### `scribe.toMarkdown(imagePath)`

-
+- Accepts PNG, JPEG, HEIC, HEIF, TIFF, GIF, BMP, WebP and **PDF**
+- Returns an empty string `''` if no text is detected
+- Throws `OllamaUnavailableError` if the Ollama server is not reachable (unless `skipPing: true`)

-
+### Batch processing

 ```ts
-
-
-
-
+import { VisionScribe, OllamaUnavailableError } from 'macos-vision';
+
+const scribe = new VisionScribe({ skipPing: true });
+
+for (const file of files) {
+  try {
+    const md = await scribe.toMarkdown(file);
+    // …
+  } catch (e) {
+    if (e instanceof OllamaUnavailableError) {
+      console.error(e.message);
+      break;
+    }
+    throw e;
+  }
 }
 ```

+### Known limitations
+
+- **Local model fidelity**: small models (`mistral-nemo`, `gemma`) may occasionally summarise or paraphrase long, dense documents. Larger models (`llama3.1:70b`, `qwen2.5:32b`) produce significantly better fidelity.
+- **Tables**: multi-column table layouts are partially supported. OCR reads cells in reading order but the LLM may not always reconstruct correct Markdown table syntax.
+- **Images / charts**: non-textual content (photos, diagrams, charts) is ignored — only text blocks extracted by Apple Vision are processed.
+- **Markdown fidelity**: the prompt strongly asks for faithful reconstruction, but LLM output is not a cryptographic or deterministic guarantee. Review important legal, financial, or compliance documents before relying on the generated Markdown.
+
 ---

-
+## Migrating from `macos-vision-md`

-
+The standalone [`macos-vision-md`](https://github.com/woladi/macos-vision-md) package has been merged into `macos-vision` as of v2.0.0. The old package will keep working as a thin re-export shim, but new projects should depend on `macos-vision` directly.

-```
-
-
-  bottomLeft: [number, number]; bottomRight: [number, number]
-  confidence: number
-}
+```diff
+- import { VisionScribe } from 'macos-vision-md';
++ import { VisionScribe } from 'macos-vision';
 ```

+```diff
+- macos-vision-md invoice.pdf -o notes.md
++ macos-vision --markdown invoice.pdf -o notes.md
+```
+
+The `VisionScribe` API, the system prompt, and the chunking strategy are unchanged. `OllamaUnavailableError`, `VisionScribeOptions`, and `ParagraphGroup` are now exported from `macos-vision`.
+
 ---

-
+## API reference — types
+
+### `ocr(imagePath, options?)`

-
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `imagePath` | `string` | — | Path to image (PNG, JPG, JPEG, WEBP) or PDF |
+| `options.format` | `'text' \| 'blocks'` | `'text'` | Plain text or structured blocks with coordinates |
+
+Returns `Promise<string>` or `Promise<VisionBlock[]>`.

 ```ts
-interface
-
-
+interface VisionBlock {
+  text: string
+  x: number      // 0–1 from left
+  y: number      // 0–1 from top
+  width: number  // 0–1
+  height: number // 0–1
+  confidence: number
+  page?: number  // 0-based, only for PDFs
 }
 ```

+### `detectFaces(imagePath)` / `detectBarcodes(imagePath)` / `detectRectangles(imagePath)` / `detectDocument(imagePath)` / `classify(imagePath)`
+
+See `src/index.ts` for full type declarations.
+
 ---

 ## Why macos-vision?

 | | macos-vision | Tesseract.js | Cloud APIs |
 |---|---|---|---|
-| Offline | ✅ | ✅ | ❌ |
+| Offline OCR | ✅ | ✅ | ❌ |
+| Offline image → Markdown | ✅ (with local Ollama) | ❌ | ❌ |
 | No API key | ✅ | ✅ | ❌ |
 | Native speed | ✅ | ❌ | — |
 | Zero runtime deps | ✅ | ❌ | ❌ |
 | OCR with bounding boxes | ✅ | ✅ | ✅ |
-| Face detection | ✅ | ❌ | ✅ |
-| Barcode / QR | ✅ | ❌ | ✅ |
-| Document detection | ✅ | ❌ | ✅ |
+| Face / barcode / document detection | ✅ | ❌ | ✅ |
 | Image classification | ✅ | ❌ | ✅ |
 | macOS only | ✅ | ❌ | ❌ |

 Apple Vision is the same engine used by macOS Spotlight, Live Text, and Shortcuts — highly optimized and accurate.

+### OCR evaluation notes
+
+In internal tests on anonymized scanned contracts, forms, declarations, and UI screenshots, Apple Vision OCR produced fewer OCR artifacts than Tesseract in most cases. The strongest gains were on multi-column contract-style scans, where Apple Vision preserved substantially more usable text with far fewer artifacts. On simpler UI screenshots, both engines performed similarly.
+
+These results are directional rather than a public benchmark suite. The corpus is not included in this repository, and future benchmark fixtures should use synthetic or public-domain documents only.
+
 ## License

 MIT
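The pipeline diagram and parameters in the README above (role-separated messages, `<ocr_source>` wrapping, `temperature=0`, `top_p=1`, `num_predict=-1`) pin down the shape of the Ollama request. As a standalone illustration only — the package's real `dist/markdown/ollama.js` is not shown in this diff, and `SYSTEM_PROMPT` below is a stand-in for the prompt the package actually ships — such a call against Ollama's `/api/chat` endpoint looks roughly like this:

```ts
// Hedged sketch of the documented request shape; not the package's code.
// SYSTEM_PROMPT is a placeholder — the real prompt lives in dist/markdown/prompt.js.
const SYSTEM_PROMPT =
  'Rewrite the OCR text as Markdown. Preserve the source text; do not summarise or add content.';

async function formatChunk(
  ocrText: string,
  model = 'mistral-nemo',
  baseUrl = 'http://localhost:11434',
): Promise<string> {
  const res = await fetch(`${baseUrl}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      stream: false, // one JSON response instead of an NDJSON stream
      messages: [
        { role: 'system', content: SYSTEM_PROMPT },
        // Wrapping in <ocr_source> tags makes it less likely that document
        // text is treated as user instructions (see the README above).
        { role: 'user', content: `<ocr_source>\n${ocrText}\n</ocr_source>` },
      ],
      options: { temperature: 0, top_p: 1, num_predict: -1 },
    }),
  });
  if (!res.ok) throw new Error(`Ollama request failed: HTTP ${res.status}`);
  const data = (await res.json()) as { message: { content: string } };
  return data.message.content;
}
```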
package/bin/pdf-helper CHANGED
Binary file
package/bin/vision-helper CHANGED
Binary file
package/dist/cli.js CHANGED

@@ -1,91 +1,154 @@
 #!/usr/bin/env node
-import { resolve } from 'path';
+import { resolve, dirname, basename, extname, join } from 'path';
+import { writeFile } from 'fs/promises';
 import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify, } from './index.js';
 const USAGE = `
-Usage: vision
+Usage: macos-vision [options] <image-or-pdf>

-
-  --ocr
-  --blocks
-  --faces
-  --barcodes
-  --rectangles
-  --document
-  --classify
-  --all
+Vision options:
+  --ocr                OCR — plain text (default)
+  --blocks             OCR — structured blocks with coordinates
+  --faces              Face detection
+  --barcodes           Barcode & QR code detection
+  --rectangles         Rectangle detection
+  --document           Document boundary detection
+  --classify           Image classification
+  --all                Run all of the above

-
+Markdown options (requires Ollama running locally):
+  --markdown           Convert image/PDF to Markdown via VisionScribe + Ollama
+  --model <name>       Ollama model name (default: mistral-nemo)
+  --ollama-url <url>   Ollama base URL (default: http://localhost:11434)
+  -o, --output <path>  Write Markdown to specified file
+  --stdout             Print Markdown to stdout instead of a file
+
+  --help               Show this help

 Examples:
-  vision
-  vision
-  vision
+  macos-vision photo.jpg
+  macos-vision --blocks --faces photo.jpg
+  macos-vision --all photo.jpg
+  macos-vision --markdown invoice.pdf -o notes.md
+  macos-vision --markdown receipt.jpg --stdout
 `.trim();
 const rawArgs = process.argv.slice(2);
 if (rawArgs.includes('--help') || rawArgs.length === 0) {
     console.log(USAGE);
     process.exit(0);
 }
-
-
+// Strip value-bearing options first so the remaining tokens are either
+// boolean flags (`--something`) or positional file paths.
+function takeOpt(name, args) {
+    const i = args.indexOf(name);
+    if (i === -1)
+        return undefined;
+    const v = args[i + 1];
+    args.splice(i, 2);
+    return v;
+}
+const argv = [...rawArgs];
+const model = takeOpt('--model', argv);
+const ollamaUrl = takeOpt('--ollama-url', argv);
+const outPath = takeOpt('-o', argv) ?? takeOpt('--output', argv);
+const flags = new Set(argv.filter((a) => a.startsWith('--')));
+const fileArgs = argv.filter((a) => !a.startsWith('-'));
 if (!fileArgs[0]) {
-    console.error('Error: no image path provided.\n');
+    console.error('Error: no image or PDF path provided.\n');
     console.log(USAGE);
     process.exit(1);
 }
-const
-
-
-const
-const
-
-
-
-
-
-const
-
-
-
-
-
-
-    flags.has('--classify');
-const useDefault = !anyFeatureFlag;
-async function main() {
-    try {
-        if (useDefault || runOcr) {
-            const text = await ocr(imagePath);
-            console.log(text);
-        }
-        if (runBlocks) {
-            const blocks = (await ocr(imagePath, { format: 'blocks' }));
-            console.log(JSON.stringify(blocks, null, 2));
-        }
-        if (runFaces) {
-            const faces = (await detectFaces(imagePath));
-            console.log(JSON.stringify(faces, null, 2));
+const inputPath = resolve(fileArgs[0]);
+// ─── Markdown pipeline ─────────────────────────────────────────────────────────────
+if (flags.has('--markdown')) {
+    const toStdout = flags.has('--stdout');
+    const opts = {};
+    if (model)
+        opts.model = model;
+    if (ollamaUrl)
+        opts.ollamaUrl = ollamaUrl;
+    (async () => {
+        const { VisionScribe, OllamaUnavailableError } = await import('./markdown/index.js');
+        const scribe = new VisionScribe(opts);
+        if (!toStdout)
+            process.stderr.write(`Converting ${fileArgs[0]}…\n`);
+        let markdown;
+        try {
+            markdown = await scribe.toMarkdown(inputPath);
         }
-
-
-
+        catch (err) {
+            if (err instanceof OllamaUnavailableError) {
+                console.error(err.message);
+                process.exit(2);
+            }
+            throw err;
         }
-        if (
-
-
+        if (toStdout) {
+            process.stdout.write(markdown);
+            return;
         }
-
-
-
+        const finalPath = outPath ??
+            join(dirname(inputPath), basename(inputPath, extname(inputPath)) + '.md');
+        await writeFile(finalPath, markdown, 'utf8');
+        process.stderr.write(`Saved: ${finalPath}\n`);
+    })().catch((err) => {
+        console.error(err instanceof Error ? err.message : String(err));
+        process.exit(1);
+    });
+}
+else {
+    // ─── Vision pipeline (OCR / detections / classification) ───────────────────────
+    const runAll = flags.has('--all');
+    const runOcr = runAll || flags.has('--ocr');
+    const runBlocks = runAll || flags.has('--blocks');
+    const runFaces = runAll || flags.has('--faces');
+    const runBarcodes = runAll || flags.has('--barcodes');
+    const runRects = runAll || flags.has('--rectangles');
+    const runDoc = runAll || flags.has('--document');
+    const runClassify = runAll || flags.has('--classify');
+    // Default: OCR text when no feature flag is given
+    const anyFeatureFlag = runAll ||
+        flags.has('--ocr') ||
+        flags.has('--blocks') ||
+        flags.has('--faces') ||
+        flags.has('--barcodes') ||
+        flags.has('--rectangles') ||
+        flags.has('--document') ||
+        flags.has('--classify');
+    const useDefault = !anyFeatureFlag;
+    (async () => {
+        try {
+            if (useDefault || runOcr) {
+                const text = await ocr(inputPath);
+                console.log(text);
+            }
+            if (runBlocks) {
+                const blocks = (await ocr(inputPath, { format: 'blocks' }));
+                console.log(JSON.stringify(blocks, null, 2));
+            }
+            if (runFaces) {
+                const faces = (await detectFaces(inputPath));
+                console.log(JSON.stringify(faces, null, 2));
+            }
+            if (runBarcodes) {
+                const barcodes = (await detectBarcodes(inputPath));
+                console.log(JSON.stringify(barcodes, null, 2));
+            }
+            if (runRects) {
+                const rectangles = (await detectRectangles(inputPath));
+                console.log(JSON.stringify(rectangles, null, 2));
+            }
+            if (runDoc) {
+                const doc = (await detectDocument(inputPath));
+                console.log(JSON.stringify(doc, null, 2));
+            }
+            if (runClassify) {
+                const labels = (await classify(inputPath));
+                console.log(JSON.stringify(labels, null, 2));
+            }
         }
-
-
-
+        catch (error) {
+            console.error('Error:', error);
+            process.exit(1);
         }
-    }
-    catch (error) {
-        console.error('Error:', error);
-        process.exit(1);
-    }
+    })();
 }
-main();
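The new argument handling above is two-phase: `takeOpt` splices each value-bearing option (and its value) out of the array, and the remaining tokens are classified as boolean flags or positional paths. A standalone TypeScript rendering of that behavior, with an illustrative `argv`:

```ts
// Same two-phase parsing as dist/cli.js, typed: strip value-bearing
// options first, then classify the leftover tokens.
function takeOpt(name: string, args: string[]): string | undefined {
  const i = args.indexOf(name);
  if (i === -1) return undefined;
  const v = args[i + 1];
  args.splice(i, 2); // remove the option and its value in place
  return v;
}

const argv = ['--markdown', '--model', 'llama3.2', 'invoice.pdf', '-o', 'notes.md'];
const model = takeOpt('--model', argv);   // 'llama3.2'
const outPath = takeOpt('-o', argv);      // 'notes.md'
const flags = new Set(argv.filter((a) => a.startsWith('--'))); // Set { '--markdown' }
const fileArgs = argv.filter((a) => !a.startsWith('-'));       // ['invoice.pdf']
console.log({ model, outPath, flags: [...flags], fileArgs });
```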
package/dist/index.d.ts CHANGED

@@ -112,3 +112,5 @@ export interface Classification {
 export declare function classify(imagePath: string): Promise<Classification[]>;
 export type { BlockKind, BaseBlock, TextBlock, FaceBlock, BarcodeBlock, RectangleBlock, DocumentBlock, LayoutBlock, InferLayoutInput, } from './layout.js';
 export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
+export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
+export type { VisionScribeOptions, ParagraphGroup } from './markdown/index.js';
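These root re-exports mean consumers no longer need the deep `macos-vision/markdown` path for types. A minimal consumer-side check, assuming the published typings match the README tables above:

```ts
import { VisionScribe, OllamaUnavailableError } from 'macos-vision';
import type { VisionScribeOptions } from 'macos-vision';

// Option names and defaults taken from the README's VisionScribe table.
const opts: VisionScribeOptions = { model: 'mistral-nemo', skipPing: true };
const scribe = new VisionScribe(opts);

// The error is exported as a class, so instanceof narrowing works:
const isOllamaDown = (e: unknown): boolean => e instanceof OllamaUnavailableError;
```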
package/dist/index.js CHANGED

@@ -15,7 +15,7 @@ async function run(flag, imagePath) {
     });
     return stdout;
 }
-// ─── PDF helpers
+// ─── PDF helpers ─────────────────────────────────────────────────────
 /**
  * Returns true if the file at `filePath` is a PDF.
  * Uses extension as a fast path; falls back to magic bytes (`%PDF`) for

@@ -75,11 +75,11 @@ async function ocrPdf(pdfPath, format) {
 export async function ocr(imagePath, options = {}) {
     const absPath = resolve(imagePath);
     const { format = 'text' } = options;
-    // ── PDF fast-path: rasterize via sips, then OCR each page
+    // ── PDF fast-path: rasterize via sips, then OCR each page ────────────
     if (await isPdf(absPath)) {
         return ocrPdf(absPath, format);
     }
-    // ── Existing image path (unchanged)
+    // ── Existing image path (unchanged) ─────────────────────────────────
     if (format === 'blocks') {
         const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
             timeout: BINARY_TIMEOUT_MS,

@@ -128,3 +128,5 @@ export async function classify(imagePath) {
     return raw;
 }
 export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
+// ─── Markdown pipeline (VisionScribe) ──────────────────────────────────────────
+export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
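The first hunk above shows only the doc comment for `isPdf`; its body falls outside the diff. Under the documented strategy — trust the `.pdf` extension as a fast path, otherwise read the `%PDF` magic bytes — a sketch implementation (not the package's actual code) could be:

```ts
import { open } from 'fs/promises';

// Sketch of the documented detection: extension fast path, then the
// first four bytes compared against the '%PDF' magic number.
async function isPdf(filePath: string): Promise<boolean> {
  if (filePath.toLowerCase().endsWith('.pdf')) return true;
  const fh = await open(filePath, 'r');
  try {
    const buf = Buffer.alloc(4);
    await fh.read(buf, 0, 4, 0); // read bytes 0..3
    return buf.toString('latin1') === '%PDF';
  } finally {
    await fh.close();
  }
}
```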
package/dist/markdown/chunker.d.ts ADDED

@@ -0,0 +1,11 @@
+import type { ParagraphGroup } from './prompt.js';
+export declare function estimateTokens(text: string): number;
+/**
+ * Split an array of paragraphs into chunks where each chunk's estimated prompt
+ * token count stays within `chunkSizeTokens`. Paragraph boundaries are never
+ * split — chunks always break between `ParagraphGroup` objects.
+ *
+ * A paragraph whose estimated token count exceeds the budget on its own is
+ * emitted as a singleton chunk with a warning.
+ */
+export declare function chunkParagraphs(paragraphs: ParagraphGroup[], chunkSizeTokens: number): ParagraphGroup[][];
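The declaration file documents the chunker's contract but not its body. A minimal implementation consistent with that contract might look like the sketch below; the shipped `chunker.js` may differ, the "~4 characters per token" estimate is a common rough heuristic rather than the package's own, and `ParagraphGroup` is assumed here to expose a `text` string:

```ts
interface ParagraphGroup {
  text: string; // assumed shape — the real type lives in dist/markdown/prompt.d.ts
}

// Rough heuristic: ~4 characters per estimated token.
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

// Greedy packing that never splits a paragraph; an oversized paragraph
// becomes a singleton chunk with a warning, per the documented contract.
function chunkParagraphs(
  paragraphs: ParagraphGroup[],
  chunkSizeTokens: number,
): ParagraphGroup[][] {
  const chunks: ParagraphGroup[][] = [];
  let current: ParagraphGroup[] = [];
  let used = 0;

  for (const p of paragraphs) {
    const cost = estimateTokens(p.text);
    if (cost > chunkSizeTokens) {
      if (current.length > 0) {
        chunks.push(current);
        current = [];
        used = 0;
      }
      console.warn(`paragraph exceeds token budget (${cost} > ${chunkSizeTokens})`);
      chunks.push([p]); // emitted as a singleton chunk
      continue;
    }
    if (used + cost > chunkSizeTokens && current.length > 0) {
      chunks.push(current);
      current = [];
      used = 0;
    }
    current.push(p);
    used += cost;
  }
  if (current.length > 0) chunks.push(current);
  return chunks;
}
```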