npm - macos-vision - Versions diffs - 0.1.4 → 0.3.0 - Mend

macos-vision 0.1.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,20 @@
 # Changelog
+## [0.3.0](https://github.com/woladi/macos-vision/compare/v0.2.0...v0.3.0) (2026-04-08)
+### Features
+* add inferLayout() — unified reading-order LayoutBlock representation ([aec507e](https://github.com/woladi/macos-vision/commit/aec507eb7cf133ec1e56759c0945563a48d871ee))
+## [0.2.0](https://github.com/woladi/macos-vision/compare/v0.1.4...v0.2.0) (2026-04-08)
+### Features
+* add confidence to VisionBlock and Barcode ([a87df27](https://github.com/woladi/macos-vision/commit/a87df275e51dec4b57fbff6e3bffc4220b96b4d7))
+### Bug Fixes
+* correct mkdirSync, CLI error on missing file, execFile timeout, README scope ([1cef2c7](https://github.com/woladi/macos-vision/commit/1cef2c7078430c9182fcd39792cf0c002833203f))
+* replace try? with do/catch in Swift helper — surface Vision errors properly ([f287065](https://github.com/woladi/macos-vision/commit/f2870655225806070be3db462ea15923201fecbf))
 ## 0.1.4 (2026-04-08)

package/README.md CHANGED Viewed

@@ -22,6 +22,19 @@ npm install macos-vision
 The native Swift binary is compiled automatically on install.
+## What this is (and isn't)
+`macos-vision` gives you **raw Apple Vision results** — text, coordinates, bounding boxes, labels.
+It is **not** a document pipeline. It does not:
+- Convert PDFs or images to Markdown
+- Understand document structure (headings, tables, paragraphs)
+- Chain multiple detections into a final report
+For those use cases, use the raw output as input to an LLM or a post-processing layer of your own.
+---
 ## CLI
 ```bash
@@ -81,8 +94,46 @@ const doc = await detectDocument('photo.jpg') // DocumentBounds | null
 // Classify image content
 const labels = await classify('photo.jpg')
+// Layout inference — unified reading-order-sorted representation
+const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes })
+// layout is LayoutBlock[] — ready to feed into a Markdown renderer or LLM context
 ```
+### Layout inference
+`inferLayout` merges raw Vision results into a unified `LayoutBlock[]` sorted in reading order (top-to-bottom, left-to-right). Text blocks are grouped into **lines** and **paragraphs** using geometric heuristics.
+```ts
+import { ocr, detectFaces, detectBarcodes, inferLayout } from 'macos-vision';
+const blocks   = await ocr('page.png', { format: 'blocks' });
+const faces    = await detectFaces('page.png');
+const barcodes = await detectBarcodes('page.png');
+const layout = inferLayout({ textBlocks: blocks, faces, barcodes });
+for (const block of layout) {
+  if (block.kind === 'text') {
+    console.log(`[p${block.paragraphId} l${block.lineId}] ${block.text}`);
+  } else {
+    console.log(`[${block.kind}] at (${block.x.toFixed(2)}, ${block.y.toFixed(2)})`);
+  }
+}
+```
+`LayoutBlock` is a discriminated union — use `block.kind` to narrow the type:
+| `kind` | Extra fields |
+|--------|-------------|
+| `'text'` | `text`, `lineId`, `paragraphId` |
+| `'barcode'` | `value`, `type` |
+| `'face'` | — |
+| `'rectangle'` | — |
+| `'document'` | — |
+> **Note:** Layout inference is a heuristic layer. It does not understand multi-column layouts or rotated text. Treat it as structured input for downstream tools, not as ground truth.
 ## API
 ### `ocr(imagePath, options?)`

package/bin/vision-helper CHANGED Viewed

Binary file

package/dist/cli.js CHANGED Viewed

@@ -1,8 +1,6 @@
 #!/usr/bin/env node
-import { resolve, dirname } from 'path';
-import { fileURLToPath } from 'url';
+import { resolve } from 'path';
 import { ocr, detectFaces, detectBarcodes, detectRectangles, detectDocument, classify, } from './index.js';
-const __dirname = dirname(fileURLToPath(import.meta.url));
 const USAGE = `
 Usage: vision-cli [options] <image>
@@ -28,9 +26,14 @@ if (rawArgs.includes('--help') || rawArgs.length === 0) {
     console.log(USAGE);
     process.exit(0);
 }
-const flags = new Set(rawArgs.filter(a => a.startsWith('--')));
-const fileArgs = rawArgs.filter(a => !a.startsWith('--'));
-const imagePath = fileArgs[0] || resolve(__dirname, '../test/fixtures/sample.png');
+const flags = new Set(rawArgs.filter((a) => a.startsWith('--')));
+const fileArgs = rawArgs.filter((a) => !a.startsWith('--'));
+if (!fileArgs[0]) {
+    console.error('Error: no image path provided.\n');
+    console.log(USAGE);
+    process.exit(1);
+}
+const imagePath = resolve(fileArgs[0]);
 const runAll = flags.has('--all');
 const runOcr = runAll || flags.has('--ocr');
 const runBlocks = runAll || flags.has('--blocks');
@@ -40,9 +43,14 @@ const runRects = runAll || flags.has('--rectangles');
 const runDoc = runAll || flags.has('--document');
 const runClassify = runAll || flags.has('--classify');
 // Default: OCR text when no feature flag is given
-const anyFeatureFlag = runAll || flags.has('--ocr') || flags.has('--blocks') ||
-    flags.has('--faces') || flags.has('--barcodes') || flags.has('--rectangles') ||
-    flags.has('--document') || flags.has('--classify');
+const anyFeatureFlag = runAll ||
+    flags.has('--ocr') ||
+    flags.has('--blocks') ||
+    flags.has('--faces') ||
+    flags.has('--barcodes') ||
+    flags.has('--rectangles') ||
+    flags.has('--document') ||
+    flags.has('--classify');
 const useDefault = !anyFeatureFlag;
 async function main() {
     try {
@@ -51,27 +59,27 @@ async function main() {
             console.log(text);
         }
         if (runBlocks) {
-            const blocks = await ocr(imagePath, { format: 'blocks' });
+            const blocks = (await ocr(imagePath, { format: 'blocks' }));
             console.log(JSON.stringify(blocks, null, 2));
         }
         if (runFaces) {
-            const faces = await detectFaces(imagePath);
+            const faces = (await detectFaces(imagePath));
             console.log(JSON.stringify(faces, null, 2));
         }
         if (runBarcodes) {
-            const barcodes = await detectBarcodes(imagePath);
+            const barcodes = (await detectBarcodes(imagePath));
             console.log(JSON.stringify(barcodes, null, 2));
         }
         if (runRects) {
-            const rectangles = await detectRectangles(imagePath);
+            const rectangles = (await detectRectangles(imagePath));
             console.log(JSON.stringify(rectangles, null, 2));
         }
         if (runDoc) {
-            const doc = await detectDocument(imagePath);
+            const doc = (await detectDocument(imagePath));
             console.log(JSON.stringify(doc, null, 2));
         }
         if (runClassify) {
-            const labels = await classify(imagePath);
+            const labels = (await classify(imagePath));
             console.log(JSON.stringify(labels, null, 2));
         }
     }

package/dist/index.d.ts CHANGED Viewed

@@ -9,6 +9,8 @@ export interface VisionBlock {
     width: number;
     /** Height, 0–1 relative to image */
     height: number;
+    /** OCR transcription confidence, 0–1 */
+    confidence: number;
 }
 export interface OcrOptions {
     /** Return plain text (default) or structured blocks with coordinates */
@@ -46,6 +48,8 @@ export interface Barcode {
     width: number;
     /** Height, 0–1 relative to image */
     height: number;
+    /** Detection confidence, 0–1 */
+    confidence: number;
 }
 export declare function detectBarcodes(imagePath: string): Promise<Barcode[]>;
 export interface Rectangle {
@@ -83,3 +87,5 @@ export interface Classification {
 }
 /** Returns top image classifications sorted by confidence (highest first). */
 export declare function classify(imagePath: string): Promise<Classification[]>;
+export type { BlockKind, BaseBlock, TextBlock, FaceBlock, BarcodeBlock, RectangleBlock, DocumentBlock, LayoutBlock, InferLayoutInput, } from './layout.js';
+export { inferLayout, sortBlocksByReadingOrder } from './layout.js';

package/dist/index.js CHANGED Viewed

@@ -5,19 +5,31 @@ import { fileURLToPath } from 'url';
 const execFileAsync = promisify(execFile);
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const BIN_PATH = resolve(__dirname, '../bin/vision-helper');
+const BINARY_TIMEOUT_MS = 30_000;
 async function run(flag, imagePath) {
-    const { stdout } = await execFileAsync(BIN_PATH, [flag, resolve(imagePath)]);
+    const { stdout } = await execFileAsync(BIN_PATH, [flag, resolve(imagePath)], {
+        timeout: BINARY_TIMEOUT_MS,
+    });
     return stdout;
 }
 export async function ocr(imagePath, options = {}) {
     const absPath = resolve(imagePath);
     const { format = 'text' } = options;
     if (format === 'blocks') {
-        const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath]);
+        const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
+            timeout: BINARY_TIMEOUT_MS,
+        });
         const raw = JSON.parse(stdout);
-        return raw.map((b) => ({ text: b.t, x: b.x, y: b.y, width: b.w, height: b.h }));
+        return raw.map((b) => ({
+            text: b.t,
+            x: b.x,
+            y: b.y,
+            width: b.w,
+            height: b.h,
+            confidence: b.confidence,
+        }));
     }
-    const { stdout } = await execFileAsync(BIN_PATH, [absPath]);
+    const { stdout } = await execFileAsync(BIN_PATH, [absPath], { timeout: BINARY_TIMEOUT_MS });
     return stdout.trim();
 }
 export async function detectFaces(imagePath) {
@@ -33,6 +45,7 @@ export async function detectBarcodes(imagePath) {
         y: b.y,
         width: b.w,
         height: b.h,
+        confidence: b.confidence,
     }));
 }
 export async function detectRectangles(imagePath) {
@@ -49,3 +62,4 @@ export async function classify(imagePath) {
     const raw = JSON.parse(await run('--classify', imagePath));
     return raw;
 }
+export { inferLayout, sortBlocksByReadingOrder } from './layout.js';

package/dist/layout.d.ts ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * @module layout
+ *
+ * Pure TypeScript layout inference layer for macos-vision.
+ *
+ * Takes raw Vision framework results and produces a unified, reading-order-sorted
+ * `LayoutBlock[]` that downstream tools (Markdown generators, LLM pipelines, etc.)
+ * can consume directly.
+ *
+ * **Limitations & intended usage**
+ * - This is a heuristic layer, not a full document parser. Line and paragraph
+ *   grouping uses simple geometric proximity — it will not be perfect for
+ *   multi-column layouts, rotated text, or unusual document structures.
+ * - No LLMs, no external dependencies, no I/O. Pure data-in → data-out.
+ * - Treat the output as a structured starting point, not ground truth.
+ */
+import type { VisionBlock, Face, Barcode, Rectangle, DocumentBounds } from './index.js';
+export type BlockKind = 'text' | 'face' | 'barcode' | 'rectangle' | 'document';
+export interface BaseBlock {
+    kind: BlockKind;
+    /** Horizontal position, 0–1 from left */
+    x: number;
+    /** Vertical position, 0–1 from top */
+    y: number;
+    /** Width, 0–1 relative to image */
+    width: number;
+    /** Height, 0–1 relative to image */
+    height: number;
+    /** Detection/recognition confidence, 0–1 (omitted when unavailable) */
+    confidence?: number;
+}
+export interface TextBlock extends BaseBlock {
+    kind: 'text';
+    /** Recognized text string */
+    text: string;
+    /**
+     * 0-based index of the visual line this block belongs to.
+     * Blocks sharing the same `lineId` are on the same horizontal line.
+     */
+    lineId: number;
+    /**
+     * 0-based index of the paragraph this block belongs to.
+     * A new paragraph begins when the vertical gap between lines exceeds
+     * ~1.5× the average line height.
+     */
+    paragraphId: number;
+}
+export interface FaceBlock extends BaseBlock {
+    kind: 'face';
+}
+export interface BarcodeBlock extends BaseBlock {
+    kind: 'barcode';
+    /** Decoded barcode / QR payload */
+    value: string;
+    /** Symbology, e.g. 'org.iso.QRCode', 'org.gs1.EAN-13' */
+    type: string;
+}
+export interface RectangleBlock extends BaseBlock {
+    kind: 'rectangle';
+}
+export interface DocumentBlock extends BaseBlock {
+    kind: 'document';
+}
+export type LayoutBlock = TextBlock | FaceBlock | BarcodeBlock | RectangleBlock | DocumentBlock;
+export interface InferLayoutInput {
+    textBlocks: VisionBlock[];
+    faces?: Face[];
+    barcodes?: Barcode[];
+    rectangles?: Rectangle[];
+    document?: DocumentBounds | null;
+}
+/**
+ * Sort any LayoutBlock array into reading order: top-to-bottom, then
+ * left-to-right within blocks that share the same approximate vertical band.
+ *
+ * Uses a 1% image-height tolerance so that blocks on the same visual row
+ * are ordered by `x` rather than by the tiny y differences between them.
+ */
+export declare function sortBlocksByReadingOrder(blocks: LayoutBlock[]): LayoutBlock[];
+/**
+ * Merge raw Apple Vision results into a unified, reading-order-sorted
+ * `LayoutBlock[]`.
+ *
+ * Text blocks are grouped into **lines** (`lineId`) and **paragraphs**
+ * (`paragraphId`) using simple bounding-box heuristics. All other block types
+ * are placed into the sorted sequence by their top-left coordinate.
+ *
+ * @example
+ * ```ts
+ * const blocks  = await ocr('page.png', { format: 'blocks' });
+ * const faces   = await detectFaces('page.png');
+ * const barcodes = await detectBarcodes('page.png');
+ *
+ * const layout = inferLayout({ textBlocks: blocks, faces, barcodes });
+ * // Feed `layout` into a Markdown renderer or an LLM context window.
+ * ```
+ */
+export declare function inferLayout(input: InferLayoutInput): LayoutBlock[];

package/dist/layout.js ADDED Viewed

@@ -0,0 +1,183 @@
+/**
+ * @module layout
+ *
+ * Pure TypeScript layout inference layer for macos-vision.
+ *
+ * Takes raw Vision framework results and produces a unified, reading-order-sorted
+ * `LayoutBlock[]` that downstream tools (Markdown generators, LLM pipelines, etc.)
+ * can consume directly.
+ *
+ * **Limitations & intended usage**
+ * - This is a heuristic layer, not a full document parser. Line and paragraph
+ *   grouping uses simple geometric proximity — it will not be perfect for
+ *   multi-column layouts, rotated text, or unusual document structures.
+ * - No LLMs, no external dependencies, no I/O. Pure data-in → data-out.
+ * - Treat the output as a structured starting point, not ground truth.
+ */
+// ─── Internal helpers ─────────────────────────────────────────────────────────
+/** Return the axis-aligned box `{ x, y, width, height }` enclosing the `topLeft`, `topRight`, `bottomLeft` and `bottomRight` entries of `corners`, each read as an [x, y] pair. */
+function cornersToRect(corners) {
+    const xs = [
+        corners.topLeft[0],
+        corners.topRight[0],
+        corners.bottomLeft[0],
+        corners.bottomRight[0],
+    ];
+    const ys = [
+        corners.topLeft[1],
+        corners.topRight[1],
+        corners.bottomLeft[1],
+        corners.bottomRight[1],
+    ];
+    const x = Math.min(...xs);
+    const y = Math.min(...ys);
+    return { x, y, width: Math.max(...xs) - x, height: Math.max(...ys) - y };
+}
+/**
+ * Group text blocks into visual lines by y-center proximity; returns an array of lines (`[]` for no blocks).
+ * Blocks are visited in ascending y-center order on a copy, so the input array is not mutated.
+ * A block joins the current line when the distance from its vertical center to the line's
+ * mean center is at most 60% of max(its height, the previously visited block's height);
+ * otherwise it starts a new line. Blocks within each line are sorted left-to-right by `x`.
+ */
+function groupTextIntoLines(blocks) {
+    if (blocks.length === 0)
+        return [];
+    const sorted = [...blocks].sort((a, b) => a.y + a.height / 2 - (b.y + b.height / 2));
+    const lines = [];
+    let currentLine = [sorted[0]];
+    let lineYCenter = sorted[0].y + sorted[0].height / 2;
+    for (let i = 1; i < sorted.length; i++) {
+        const block = sorted[i];
+        const blockYCenter = block.y + block.height / 2;
+        const threshold = Math.max(block.height, sorted[i - 1].height) * 0.6;
+        if (Math.abs(blockYCenter - lineYCenter) <= threshold) {
+            currentLine.push(block);
+            // Recompute the line center as the mean y-center of every block now in the line.
+            lineYCenter =
+                currentLine.reduce((sum, b) => sum + b.y + b.height / 2, 0) / currentLine.length;
+        }
+        else {
+            lines.push(currentLine.sort((a, b) => a.x - b.x));
+            currentLine = [block];
+            lineYCenter = blockYCenter;
+        }
+    }
+    lines.push(currentLine.sort((a, b) => a.x - b.x));
+    return lines;
+}
+/**
+ * Assign a 0-based paragraph index to each line; returns one id per line (`[]` for no lines).
+ * Expects `lines` ordered top-to-bottom, since only consecutive lines are compared.
+ * A new paragraph begins when the gap between the lowest bottom edge of one line and the
+ * highest top edge of the next exceeds 1.5× the mean line height (each line's tallest block).
+ */
+function assignParagraphIds(lines) {
+    if (lines.length === 0)
+        return [];
+    const lineHeights = lines.map((line) => Math.max(...line.map((b) => b.height)));
+    const avgLineHeight = lineHeights.reduce((s, h) => s + h, 0) / lineHeights.length;
+    const ids = [0];
+    let paragraphId = 0;
+    for (let i = 1; i < lines.length; i++) {
+        const prevBottom = Math.max(...lines[i - 1].map((b) => b.y + b.height));
+        const currTop = Math.min(...lines[i].map((b) => b.y));
+        const gap = currTop - prevBottom;
+        if (gap > avgLineHeight * 1.5)
+            paragraphId++;
+        ids.push(paragraphId);
+    }
+    return ids;
+}
+/**
+ * Return a copy of `blocks` in reading order: top-to-bottom by `y`, then
+ * left-to-right by `x` when two blocks' `y` values differ by at most 0.01.
+ * The input array is not mutated.
+ * NOTE(review): the tolerance is applied pairwise, so the comparator is not
+ * transitive; chains of near-equal `y` values may not order consistently.
+ */
+export function sortBlocksByReadingOrder(blocks) {
+    return [...blocks].sort((a, b) => {
+        const dy = a.y - b.y;
+        // Same row when |dy| <= 0.01 (1% of image height in 0–1 coordinates): fall through to `x`.
+        if (Math.abs(dy) > 0.01)
+            return dy;
+        return a.x - b.x;
+    });
+}
+// ─── Public API ───────────────────────────────────────────────────────────────
+/**
+ * Merge raw Apple Vision results into one `LayoutBlock[]`, tagging each entry with a
+ * `kind` and returning the combined list sorted by `sortBlocksByReadingOrder`.
+ *
+ * Text blocks get a `lineId` (index of their line from `groupTextIntoLines`) and a
+ * `paragraphId`; rectangles and the document are reduced to axis-aligned boxes.
+ * `faces`, `barcodes`, `rectangles` and `document` may be omitted; `textBlocks` may not.
+ * NOTE(review): the final sort also reorders text blocks, so output order may differ from `lineId` order.
+ * @example
+ * ```ts
+ * const blocks  = await ocr('page.png', { format: 'blocks' });
+ * const faces   = await detectFaces('page.png');
+ * const barcodes = await detectBarcodes('page.png');
+ *
+ * const layout = inferLayout({ textBlocks: blocks, faces, barcodes });
+ * // Feed `layout` into a Markdown renderer or an LLM context window.
+ * ```
+ */
+export function inferLayout(input) {
+    const result = [];
+    // ── Text blocks (with line / paragraph grouping) ──────────────────────────
+    const lines = groupTextIntoLines(input.textBlocks);
+    const paragraphIds = assignParagraphIds(lines);
+    lines.forEach((line, lineId) => {
+        const paragraphId = paragraphIds[lineId];
+        for (const b of line) {
+            result.push({
+                kind: 'text',
+                x: b.x,
+                y: b.y,
+                width: b.width,
+                height: b.height,
+                confidence: b.confidence,
+                text: b.text,
+                lineId,
+                paragraphId,
+            });
+        }
+    });
+    // ── Faces ─────────────────────────────────────────────────────────────────
+    for (const f of input.faces ?? []) {
+        result.push({
+            kind: 'face',
+            x: f.x,
+            y: f.y,
+            width: f.width,
+            height: f.height,
+            confidence: f.confidence,
+        });
+    }
+    // ── Barcodes ──────────────────────────────────────────────────────────────
+    for (const b of input.barcodes ?? []) {
+        result.push({
+            kind: 'barcode',
+            x: b.x,
+            y: b.y,
+            width: b.width,
+            height: b.height,
+            confidence: b.confidence,
+            value: b.value,
+            type: b.type,
+        });
+    }
+    // ── Rectangles ────────────────────────────────────────────────────────────
+    for (const r of input.rectangles ?? []) {
+        const bbox = cornersToRect(r);
+        result.push({ kind: 'rectangle', ...bbox, confidence: r.confidence });
+    }
+    // ── Document boundary ─────────────────────────────────────────────────────
+    if (input.document) {
+        const bbox = cornersToRect(input.document);
+        result.push({ kind: 'document', ...bbox, confidence: input.document.confidence });
+    }
+    return sortBlocksByReadingOrder(result);
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "macos-vision",
-  "version": "0.1.4",
+  "version": "0.3.0",
   "description": "Apple Vision OCR & image analysis for Node.js — native, fast, offline, no API keys",
   "author": "Adrian Wolczuk",
   "license": "MIT",

package/scripts/build-native.js CHANGED Viewed

@@ -13,9 +13,7 @@ if (existsSync(binPath)) {
   process.exit(0);
 }
-if (!mkdirSync(binDir, { recursive: true }) === false) {
-  // dir created
-}
+mkdirSync(binDir, { recursive: true });
 try {
   execSync(`swiftc -O "${swiftSrc}" -o "${binPath}"`, { stdio: 'inherit' });