macos-vision 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.3.0](https://github.com/woladi/macos-vision/compare/v0.2.0...v0.3.0) (2026-04-08)
4
+
5
+ ### Features
6
+
7
+ * add inferLayout() — unified reading-order LayoutBlock representation ([aec507e](https://github.com/woladi/macos-vision/commit/aec507eb7cf133ec1e56759c0945563a48d871ee))
8
+
3
9
  ## [0.2.0](https://github.com/woladi/macos-vision/compare/v0.1.4...v0.2.0) (2026-04-08)
4
10
 
5
11
  ### Features
package/README.md CHANGED
@@ -94,8 +94,46 @@ const doc = await detectDocument('photo.jpg') // DocumentBounds | null
94
94
 
95
95
  // Classify image content
96
96
  const labels = await classify('photo.jpg')
97
+
98
+ // Layout inference — unified reading-order-sorted representation
99
+ const layout = inferLayout({ textBlocks: blocks, faces, barcodes: codes })
100
+ // layout is LayoutBlock[] — ready to feed into a Markdown renderer or LLM context
97
101
  ```
98
102
 
103
+ ### Layout inference
104
+
105
+ `inferLayout` merges raw Vision results into a unified `LayoutBlock[]` sorted in reading order (top-to-bottom, left-to-right). Text blocks are grouped into **lines** and **paragraphs** using geometric heuristics.
106
+
107
+ ```ts
108
+ import { ocr, detectFaces, detectBarcodes, inferLayout } from 'macos-vision';
109
+
110
+ const blocks = await ocr('page.png', { format: 'blocks' });
111
+ const faces = await detectFaces('page.png');
112
+ const barcodes = await detectBarcodes('page.png');
113
+
114
+ const layout = inferLayout({ textBlocks: blocks, faces, barcodes });
115
+
116
+ for (const block of layout) {
117
+ if (block.kind === 'text') {
118
+ console.log(`[p${block.paragraphId} l${block.lineId}] ${block.text}`);
119
+ } else {
120
+ console.log(`[${block.kind}] at (${block.x.toFixed(2)}, ${block.y.toFixed(2)})`);
121
+ }
122
+ }
123
+ ```
124
+
125
+ `LayoutBlock` is a discriminated union — use `block.kind` to narrow the type:
126
+
127
+ | `kind` | Extra fields |
128
+ |--------|-------------|
129
+ | `'text'` | `text`, `lineId`, `paragraphId` |
130
+ | `'barcode'` | `value`, `type` |
131
+ | `'face'` | — |
132
+ | `'rectangle'` | — |
133
+ | `'document'` | — |
134
+
135
+ > **Note:** Layout inference is a heuristic layer. It does not understand multi-column layouts or rotated text. Treat it as structured input for downstream tools, not as ground truth.
136
+
99
137
  ## API
100
138
 
101
139
  ### `ocr(imagePath, options?)`
package/dist/index.d.ts CHANGED
@@ -87,3 +87,5 @@ export interface Classification {
87
87
  }
88
88
  /** Returns top image classifications sorted by confidence (highest first). */
89
89
  export declare function classify(imagePath: string): Promise<Classification[]>;
90
+ export type { BlockKind, BaseBlock, TextBlock, FaceBlock, BarcodeBlock, RectangleBlock, DocumentBlock, LayoutBlock, InferLayoutInput, } from './layout.js';
91
+ export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
package/dist/index.js CHANGED
@@ -62,3 +62,4 @@ export async function classify(imagePath) {
62
62
  const raw = JSON.parse(await run('--classify', imagePath));
63
63
  return raw;
64
64
  }
65
+ export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
package/dist/layout.d.ts ADDED
@@ -0,0 +1,98 @@
1
+ /**
2
+ * @module layout
3
+ *
4
+ * Pure TypeScript layout inference layer for macos-vision.
5
+ *
6
+ * Takes raw Vision framework results and produces a unified, reading-order-sorted
7
+ * `LayoutBlock[]` that downstream tools (Markdown generators, LLM pipelines, etc.)
8
+ * can consume directly.
9
+ *
10
+ * **Limitations & intended usage**
11
+ * - This is a heuristic layer, not a full document parser. Line and paragraph
12
+ * grouping uses simple geometric proximity — it will not be perfect for
13
+ * multi-column layouts, rotated text, or unusual document structures.
14
+ * - No LLMs, no external dependencies, no I/O. Pure data-in → data-out.
15
+ * - Treat the output as a structured starting point, not ground truth.
16
+ */
17
+ import type { VisionBlock, Face, Barcode, Rectangle, DocumentBounds } from './index.js';
18
/**
 * Discriminant tag carried in the `kind` field of every layout block.
 * Each member corresponds to exactly one variant of the `LayoutBlock` union.
 */
export type BlockKind = 'text' | 'face' | 'barcode' | 'rectangle' | 'document';
19
/**
 * Fields shared by every layout block: the `kind` discriminant, an
 * axis-aligned bounding box in image-relative units, and an optional
 * confidence score.
 */
export interface BaseBlock {
    /** Discriminant used to narrow a block to its concrete variant. */
    kind: BlockKind;
    /** Horizontal position, 0–1 from left */
    x: number;
    /** Vertical position, 0–1 from top */
    y: number;
    /** Width, 0–1 relative to image */
    width: number;
    /** Height, 0–1 relative to image */
    height: number;
    /** Detection/recognition confidence, 0–1 (omitted when unavailable) */
    confidence?: number;
}
32
/**
 * A recognized piece of text, annotated with the visual line and paragraph
 * it was assigned to during layout inference.
 */
export interface TextBlock extends BaseBlock {
    kind: 'text';
    /** Recognized text string */
    text: string;
    /**
     * 0-based index of the visual line this block belongs to.
     * Blocks sharing the same `lineId` are on the same horizontal line.
     */
    lineId: number;
    /**
     * 0-based index of the paragraph this block belongs to.
     * A new paragraph begins when the vertical gap between lines exceeds
     * ~1.5× the average line height.
     */
    paragraphId: number;
}
48
/** Layout block for a face; carries only the shared `BaseBlock` fields. */
export interface FaceBlock extends BaseBlock {
    kind: 'face';
}
51
/** Layout block for a barcode or QR code, including its decoded contents. */
export interface BarcodeBlock extends BaseBlock {
    kind: 'barcode';
    /** Decoded barcode / QR payload */
    value: string;
    /** Symbology, e.g. 'org.iso.QRCode', 'org.gs1.EAN-13' */
    type: string;
}
58
/** Layout block for a rectangle; carries only the shared `BaseBlock` fields. */
export interface RectangleBlock extends BaseBlock {
    kind: 'rectangle';
}
61
/** Layout block for the document boundary; carries only the shared `BaseBlock` fields. */
export interface DocumentBlock extends BaseBlock {
    kind: 'document';
}
64
/**
 * Discriminated union of every block variant. Check `kind` to narrow to the
 * variant-specific fields (`text`/`lineId`/`paragraphId`, or `value`/`type`).
 */
export type LayoutBlock = TextBlock | FaceBlock | BarcodeBlock | RectangleBlock | DocumentBlock;
65
/**
 * Raw detection results accepted by `inferLayout`. Only `textBlocks` is
 * required; every other source may be omitted.
 */
export interface InferLayoutInput {
    /** OCR text blocks, e.g. the result of `ocr(path, { format: 'blocks' })`. */
    textBlocks: VisionBlock[];
    /** Face detections, e.g. the result of `detectFaces(path)`. */
    faces?: Face[];
    /** Barcode detections, e.g. the result of `detectBarcodes(path)`. */
    barcodes?: Barcode[];
    /** Rectangle detections; each is reduced to its axis-aligned bounding box. */
    rectangles?: Rectangle[];
    /** Document boundary; when `null` or omitted no document block is emitted. */
    document?: DocumentBounds | null;
}
72
/**
 * Sort any LayoutBlock array into reading order: top-to-bottom, then
 * left-to-right within blocks that share the same approximate vertical band.
 *
 * Uses a 1% image-height tolerance so that blocks on the same visual row
 * are ordered by `x` rather than by the tiny y differences between them.
 *
 * @param blocks Blocks to order; only their `x` and `y` are compared.
 * @returns A new array in reading order. The input array is not modified.
 */
export declare function sortBlocksByReadingOrder(blocks: LayoutBlock[]): LayoutBlock[];
80
/**
 * Merge raw Apple Vision results into a unified, reading-order-sorted
 * `LayoutBlock[]`.
 *
 * Text blocks are grouped into **lines** (`lineId`) and **paragraphs**
 * (`paragraphId`) using simple bounding-box heuristics. All other block types
 * are placed into the sorted sequence by their top-left coordinate.
 *
 * @param input Detection results to merge; see `InferLayoutInput`.
 * @returns A newly built array containing one block per input detection.
 *
 * @example
 * ```ts
 * const blocks = await ocr('page.png', { format: 'blocks' });
 * const faces = await detectFaces('page.png');
 * const barcodes = await detectBarcodes('page.png');
 *
 * const layout = inferLayout({ textBlocks: blocks, faces, barcodes });
 * // Feed `layout` into a Markdown renderer or an LLM context window.
 * ```
 */
export declare function inferLayout(input: InferLayoutInput): LayoutBlock[];
package/dist/layout.js ADDED
@@ -0,0 +1,183 @@
1
+ /**
2
+ * @module layout
3
+ *
4
+ * Pure TypeScript layout inference layer for macos-vision.
5
+ *
6
+ * Takes raw Vision framework results and produces a unified, reading-order-sorted
7
+ * `LayoutBlock[]` that downstream tools (Markdown generators, LLM pipelines, etc.)
8
+ * can consume directly.
9
+ *
10
+ * **Limitations & intended usage**
11
+ * - This is a heuristic layer, not a full document parser. Line and paragraph
12
+ * grouping uses simple geometric proximity — it will not be perfect for
13
+ * multi-column layouts, rotated text, or unusual document structures.
14
+ * - No LLMs, no external dependencies, no I/O. Pure data-in → data-out.
15
+ * - Treat the output as a structured starting point, not ground truth.
16
+ */
17
+ // ─── Internal helpers ─────────────────────────────────────────────────────────
18
/**
 * Reduce a quadrilateral to the smallest axis-aligned rectangle enclosing it.
 *
 * @param corners Object with `topLeft`, `topRight`, `bottomLeft` and
 *   `bottomRight`, each an `[x, y]` pair.
 * @returns `{ x, y, width, height }` where `x`/`y` are the minimum coordinates
 *   and `width`/`height` the extent to the maximum coordinates.
 */
function cornersToRect(corners) {
    const points = [corners.topLeft, corners.topRight, corners.bottomLeft, corners.bottomRight];
    const horizontal = points.map((point) => point[0]);
    const vertical = points.map((point) => point[1]);
    const left = Math.min(...horizontal);
    const top = Math.min(...vertical);
    const right = Math.max(...horizontal);
    const bottom = Math.max(...vertical);
    return { x: left, y: top, width: right - left, height: bottom - top };
}
36
/**
 * Group text blocks into visual lines using y-center proximity.
 *
 * Blocks are visited in order of ascending vertical center. A block joins the
 * current line when the distance between its center and the line's running
 * mean center is at most 60% of the larger of two heights: its own and that of
 * the block visited immediately before it. Otherwise it starts a new line.
 * Blocks within each line are sorted left-to-right by `x`.
 *
 * @param blocks Objects with numeric `x`, `y` and `height`. The array itself
 *   is not modified. `null`/`undefined` is treated as empty.
 * @returns An array of lines (top-to-bottom), each an array of the original
 *   block objects ordered by `x`.
 */
function groupTextIntoLines(blocks) {
    if (!blocks || blocks.length === 0)
        return [];
    const centerOf = (b) => b.y + b.height / 2;
    const byX = (a, b) => a.x - b.x;
    const sorted = [...blocks].sort((a, b) => centerOf(a) - centerOf(b));
    const lines = [];
    let currentLine = [sorted[0]];
    // Running sum of member centers, so the line mean is updated in O(1) per
    // block instead of re-reducing the whole line on every push.
    let centerSum = centerOf(sorted[0]);
    let lineYCenter = centerSum;
    for (let i = 1; i < sorted.length; i++) {
        const block = sorted[i];
        const blockYCenter = centerOf(block);
        const threshold = Math.max(block.height, sorted[i - 1].height) * 0.6;
        if (Math.abs(blockYCenter - lineYCenter) <= threshold) {
            currentLine.push(block);
            // Same left-to-right addition order as summing the members from
            // zero, so the mean is numerically identical to a full recompute.
            centerSum = centerSum + block.y + block.height / 2;
            lineYCenter = centerSum / currentLine.length;
        }
        else {
            lines.push(currentLine.sort(byX));
            currentLine = [block];
            centerSum = blockYCenter;
            lineYCenter = blockYCenter;
        }
    }
    lines.push(currentLine.sort(byX));
    return lines;
}
69
/**
 * Compute a paragraph index for every line.
 *
 * The first line is paragraph 0. Each later line starts a new paragraph when
 * the gap between the lowest bottom edge of the previous line and the highest
 * top edge of this line is greater than 1.5× the mean line height, where a
 * line's height is that of its tallest block.
 *
 * @param lines Array of lines, each a non-empty array of objects with numeric
 *   `y` and `height`.
 * @returns One 0-based paragraph index per line, in the same order.
 */
function assignParagraphIds(lines) {
    const lineCount = lines.length;
    if (lineCount === 0) {
        return [];
    }
    let heightTotal = 0;
    for (const line of lines) {
        heightTotal += Math.max(...line.map((member) => member.height));
    }
    const breakGap = (heightTotal / lineCount) * 1.5;
    const paragraphIds = [0];
    let current = 0;
    lines.slice(1).forEach((line, offset) => {
        // `offset` indexes the sliced array, so `lines[offset]` is the line above.
        const above = lines[offset];
        const aboveBottom = Math.max(...above.map((member) => member.y + member.height));
        const top = Math.min(...line.map((member) => member.y));
        if (top - aboveBottom > breakGap) {
            current += 1;
        }
        paragraphIds.push(current);
    });
    return paragraphIds;
}
92
/**
 * Sort blocks into reading order: top-to-bottom by row, then left-to-right
 * within a row.
 *
 * Blocks are first ordered by `y`, then swept into rows: a block belongs to
 * the current row while its `y` is within 1% of image height (0.01) of the
 * row's first, topmost block. Each row is then ordered by `x`. Blocks that tie
 * keep their relative input order.
 *
 * A single comparator of the form "if |dy| > tolerance compare y, else compare
 * x" is not transitive, so the order it produces depends on input order and on
 * the engine's sort algorithm. Banding into rows first makes the result
 * deterministic, and it gives the same output whenever rows are clearly
 * separated.
 *
 * @param blocks Objects with numeric `x` and `y`; no other field is read.
 * @returns A new array in reading order. The input array is not modified.
 */
export function sortBlocksByReadingOrder(blocks) {
    const ROW_TOLERANCE = 0.01;
    // Remember input positions so ties resolve to the original relative order.
    const entries = blocks.map((block, index) => ({ block, index }));
    entries.sort((a, b) => a.block.y - b.block.y || a.index - b.index);
    const ordered = [];
    let row = [];
    let rowTop = 0;
    const flushRow = () => {
        row.sort((a, b) => a.block.x - b.block.x || a.index - b.index);
        for (const entry of row) {
            ordered.push(entry.block);
        }
        row = [];
    };
    for (const entry of entries) {
        if (row.length > 0 && Math.abs(entry.block.y - rowTop) > ROW_TOLERANCE) {
            flushRow();
        }
        if (row.length === 0) {
            // The row is anchored on its topmost block so it cannot drift downward.
            rowTop = entry.block.y;
        }
        row.push(entry);
    }
    flushRow();
    return ordered;
}
108
+ // ─── Public API ───────────────────────────────────────────────────────────────
109
/**
 * Merge raw Apple Vision results into a unified, reading-order-sorted
 * `LayoutBlock[]`.
 *
 * Text blocks are grouped into **lines** (`lineId`) and **paragraphs**
 * (`paragraphId`) using simple bounding-box heuristics. For ordering, every
 * text block is keyed on the top edge of its line rather than its own top
 * edge. Blocks grouped into one line therefore stay together and read
 * left-to-right, even when their individual top edges differ by more than the
 * sorter's 1% row tolerance (e.g. mixed font sizes on one line). All other
 * block types are placed into the sequence by their own top-left coordinate.
 *
 * @param input Detection results. `textBlocks` is expected; a missing value is
 *   treated as an empty list. `faces`, `barcodes`, `rectangles` and `document`
 *   are optional.
 * @returns A new array with one block per detection. `confidence` is copied
 *   through as-is and may be `undefined`.
 *
 * @example
 * ```ts
 * const blocks = await ocr('page.png', { format: 'blocks' });
 * const faces = await detectFaces('page.png');
 * const barcodes = await detectBarcodes('page.png');
 *
 * const layout = inferLayout({ textBlocks: blocks, faces, barcodes });
 * // Feed `layout` into a Markdown renderer or an LLM context window.
 * ```
 */
export function inferLayout(input) {
    // Each entry carries the coordinates used for ordering (`x`, `y`) alongside
    // the block that will be returned. The sorter only compares `x` and `y`.
    const entries = [];
    const add = (block, sortY) => {
        entries.push({ x: block.x, y: sortY, block });
    };
    // ── Text blocks (with line / paragraph grouping) ──────────────────────────
    const lines = groupTextIntoLines(input.textBlocks ?? []);
    const paragraphIds = assignParagraphIds(lines);
    lines.forEach((line, lineId) => {
        const paragraphId = paragraphIds[lineId];
        // Shared sort key for the whole line: its highest top edge.
        const lineTop = Math.min(...line.map((b) => b.y));
        for (const b of line) {
            add({
                kind: 'text',
                x: b.x,
                y: b.y,
                width: b.width,
                height: b.height,
                confidence: b.confidence,
                text: b.text,
                lineId,
                paragraphId,
            }, lineTop);
        }
    });
    // ── Faces ─────────────────────────────────────────────────────────────────
    for (const f of input.faces ?? []) {
        add({
            kind: 'face',
            x: f.x,
            y: f.y,
            width: f.width,
            height: f.height,
            confidence: f.confidence,
        }, f.y);
    }
    // ── Barcodes ──────────────────────────────────────────────────────────────
    for (const b of input.barcodes ?? []) {
        add({
            kind: 'barcode',
            x: b.x,
            y: b.y,
            width: b.width,
            height: b.height,
            confidence: b.confidence,
            value: b.value,
            type: b.type,
        }, b.y);
    }
    // ── Rectangles ────────────────────────────────────────────────────────────
    for (const r of input.rectangles ?? []) {
        const bbox = cornersToRect(r);
        add({ kind: 'rectangle', ...bbox, confidence: r.confidence }, bbox.y);
    }
    // ── Document boundary ─────────────────────────────────────────────────────
    if (input.document) {
        const bbox = cornersToRect(input.document);
        add({ kind: 'document', ...bbox, confidence: input.document.confidence }, bbox.y);
    }
    return sortBlocksByReadingOrder(entries).map((entry) => entry.block);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "macos-vision",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "Apple Vision OCR & image analysis for Node.js — native, fast, offline, no API keys",
5
5
  "author": "Adrian Wolczuk",
6
6
  "license": "MIT",