macos-vision 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +74 -1
- package/dist/layout.d.ts +8 -0
- package/dist/layout.js +8 -0
- package/package.json +1 -6
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [1.1.0](https://github.com/woladi/macos-vision/compare/v1.0.3...v1.1.0) (2026-04-09)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
|
|
7
|
+
* add PDF support via sips rasterization ([a48bf17](https://github.com/woladi/macos-vision/commit/a48bf17579a6df11aed6eadbde4fa5041ccaa981))
|
|
8
|
+
|
|
9
|
+
## [1.0.3](https://github.com/woladi/macos-vision/compare/v1.0.2...v1.0.3) (2026-04-08)
|
|
10
|
+
|
|
11
|
+
### Reverts
|
|
12
|
+
|
|
13
|
+
* remove socket.ignore field — worsens supply chain risk score ([a1827ad](https://github.com/woladi/macos-vision/commit/a1827ad489220ebb7a2e8c85632945fe969438db))
|
|
14
|
+
|
|
3
15
|
## [1.0.2](https://github.com/woladi/macos-vision/compare/v1.0.1...v1.0.2) (2026-04-08)
|
|
4
16
|
|
|
5
17
|
## [1.0.1](https://github.com/woladi/macos-vision/compare/v0.3.1...v1.0.1) (2026-04-08)
|
package/dist/index.d.ts
CHANGED
|
@@ -11,6 +11,8 @@ export interface VisionBlock {
|
|
|
11
11
|
height: number;
|
|
12
12
|
/** OCR transcription confidence, 0–1 */
|
|
13
13
|
confidence: number;
|
|
14
|
+
/** 0-based page index. Present only when the source was a PDF. Absent for images. */
|
|
15
|
+
page?: number;
|
|
14
16
|
}
|
|
15
17
|
export interface OcrOptions {
|
|
16
18
|
/** Return plain text (default) or structured blocks with coordinates */
|
package/dist/index.js
CHANGED
|
@@ -1,20 +1,93 @@
|
|
|
1
1
|
import { execFile } from 'child_process';
|
|
2
2
|
import { promisify } from 'util';
|
|
3
|
-
import { resolve, dirname } from 'path';
|
|
3
|
+
import { resolve, dirname, extname, join } from 'path';
|
|
4
4
|
import { fileURLToPath } from 'url';
|
|
5
|
+
import { tmpdir } from 'os';
|
|
6
|
+
import { open, mkdir, readdir, rm } from 'fs/promises';
|
|
5
7
|
const execFileAsync = promisify(execFile);
|
|
6
8
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
7
9
|
const BIN_PATH = resolve(__dirname, '../bin/vision-helper');
|
|
8
10
|
const BINARY_TIMEOUT_MS = 30_000;
|
|
11
|
+
const SIPS_TIMEOUT_MS = 60_000;
|
|
9
12
|
async function run(flag, imagePath) {
|
|
10
13
|
const { stdout } = await execFileAsync(BIN_PATH, [flag, resolve(imagePath)], {
|
|
11
14
|
timeout: BINARY_TIMEOUT_MS,
|
|
12
15
|
});
|
|
13
16
|
return stdout;
|
|
14
17
|
}
|
|
18
|
+
// ─── PDF helpers ─────────────────────────────────────────────────────────────
|
|
19
|
+
/**
|
|
20
|
+
* Returns true if the file at `filePath` is a PDF.
|
|
21
|
+
* Uses extension as a fast path; falls back to magic bytes (`%PDF`) for
|
|
22
|
+
* files whose extension does not match their actual content.
|
|
23
|
+
*/
|
|
24
|
+
async function isPdf(filePath) {
|
|
25
|
+
if (extname(filePath).toLowerCase() === '.pdf')
|
|
26
|
+
return true;
|
|
27
|
+
let fh;
|
|
28
|
+
try {
|
|
29
|
+
fh = await open(filePath, 'r');
|
|
30
|
+
const buf = Buffer.alloc(4);
|
|
31
|
+
await fh.read(buf, 0, 4, 0);
|
|
32
|
+
return buf[0] === 0x25 && buf[1] === 0x50 && buf[2] === 0x44 && buf[3] === 0x46;
|
|
33
|
+
}
|
|
34
|
+
finally {
|
|
35
|
+
await fh?.close();
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Rasterizes a PDF to PNG files in `outDir` using macOS `sips`.
|
|
40
|
+
* Returns sorted list of absolute PNG paths (order = page order).
|
|
41
|
+
*
|
|
42
|
+
* sips names single-page output `{basename}.png` and multi-page output
|
|
43
|
+
* `{basename}-1.png`, `{basename}-2.png`, etc. The numeric sort handles both.
|
|
44
|
+
*/
|
|
45
|
+
async function rasterizePdf(pdfPath, outDir) {
|
|
46
|
+
await execFileAsync('sips', ['-s', 'format', 'png', '--resampleHeight', '2000', pdfPath, '--out', outDir], { timeout: SIPS_TIMEOUT_MS });
|
|
47
|
+
const entries = await readdir(outDir);
|
|
48
|
+
const pngs = entries.filter((n) => n.toLowerCase().endsWith('.png'));
|
|
49
|
+
pngs.sort((a, b) => {
|
|
50
|
+
const numA = parseInt(a.match(/-(\d+)\.png$/i)?.[1] ?? '0', 10);
|
|
51
|
+
const numB = parseInt(b.match(/-(\d+)\.png$/i)?.[1] ?? '0', 10);
|
|
52
|
+
return numA - numB;
|
|
53
|
+
});
|
|
54
|
+
return pngs.map((n) => join(outDir, n));
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Full PDF OCR pipeline: rasterize → per-page OCR → merge results.
|
|
58
|
+
* Temporary PNG files are always cleaned up in the `finally` block.
|
|
59
|
+
*/
|
|
60
|
+
async function ocrPdf(pdfPath, format) {
|
|
61
|
+
const outDir = join(tmpdir(), `macos-vision-${globalThis.crypto.randomUUID()}`);
|
|
62
|
+
await mkdir(outDir, { recursive: true });
|
|
63
|
+
try {
|
|
64
|
+
const pages = await rasterizePdf(pdfPath, outDir);
|
|
65
|
+
if (format === 'blocks') {
|
|
66
|
+
const all = [];
|
|
67
|
+
for (let i = 0; i < pages.length; i++) {
|
|
68
|
+
const blocks = (await ocr(pages[i], { format: 'blocks' }));
|
|
69
|
+
all.push(...blocks.map((b) => ({ ...b, page: i })));
|
|
70
|
+
}
|
|
71
|
+
return all;
|
|
72
|
+
}
|
|
73
|
+
const texts = [];
|
|
74
|
+
for (let i = 0; i < pages.length; i++) {
|
|
75
|
+
texts.push((await ocr(pages[i])));
|
|
76
|
+
}
|
|
77
|
+
return texts.join('\n\n--- Page Break ---\n\n');
|
|
78
|
+
}
|
|
79
|
+
finally {
|
|
80
|
+
await rm(outDir, { recursive: true, force: true });
|
|
81
|
+
}
|
|
82
|
+
}
|
|
15
83
|
export async function ocr(imagePath, options = {}) {
|
|
16
84
|
const absPath = resolve(imagePath);
|
|
17
85
|
const { format = 'text' } = options;
|
|
86
|
+
// ── PDF fast-path: rasterize via sips, then OCR each page ────────────────
|
|
87
|
+
if (await isPdf(absPath)) {
|
|
88
|
+
return ocrPdf(absPath, format);
|
|
89
|
+
}
|
|
90
|
+
// ── Existing image path (unchanged) ──────────────────────────────────────
|
|
18
91
|
if (format === 'blocks') {
|
|
19
92
|
const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
|
|
20
93
|
timeout: BINARY_TIMEOUT_MS,
|
package/dist/layout.d.ts
CHANGED
|
@@ -85,6 +85,14 @@ export declare function sortBlocksByReadingOrder(blocks: LayoutBlock[]): LayoutB
|
|
|
85
85
|
* (`paragraphId`) using simple bounding-box heuristics. All other block types
|
|
86
86
|
* are placed into the sorted sequence by their top-left coordinate.
|
|
87
87
|
*
|
|
88
|
+
* **Multi-page PDFs**: `VisionBlock` items from PDF OCR carry an optional `page` field (0-based).
|
|
89
|
+
* Because all coordinates are page-local (0–1 relative to each page), mixing blocks from
|
|
90
|
+
* different pages produces meaningless geometry. Pre-filter by page before calling inferLayout:
|
|
91
|
+
* ```ts
|
|
92
|
+
* const pageBlocks = pdfBlocks.filter(b => b.page === 0);
|
|
93
|
+
* const layout = inferLayout({ textBlocks: pageBlocks });
|
|
94
|
+
* ```
|
|
95
|
+
*
|
|
88
96
|
* @example
|
|
89
97
|
* ```ts
|
|
90
98
|
* const blocks = await ocr('page.png', { format: 'blocks' });
|
package/dist/layout.js
CHANGED
|
@@ -114,6 +114,14 @@ export function sortBlocksByReadingOrder(blocks) {
|
|
|
114
114
|
* (`paragraphId`) using simple bounding-box heuristics. All other block types
|
|
115
115
|
* are placed into the sorted sequence by their top-left coordinate.
|
|
116
116
|
*
|
|
117
|
+
* **Multi-page PDFs**: `VisionBlock` items from PDF OCR carry an optional `page` field (0-based).
|
|
118
|
+
* Because all coordinates are page-local (0–1 relative to each page), mixing blocks from
|
|
119
|
+
* different pages produces meaningless geometry. Pre-filter by page before calling inferLayout:
|
|
120
|
+
* ```ts
|
|
121
|
+
* const pageBlocks = pdfBlocks.filter(b => b.page === 0);
|
|
122
|
+
* const layout = inferLayout({ textBlocks: pageBlocks });
|
|
123
|
+
* ```
|
|
124
|
+
*
|
|
117
125
|
* @example
|
|
118
126
|
* ```ts
|
|
119
127
|
* const blocks = await ocr('page.png', { format: 'blocks' });
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "macos-vision",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Apple Vision OCR & image analysis for Node.js — native, fast, offline, no API keys",
|
|
5
5
|
"author": "Adrian Wolczuk",
|
|
6
6
|
"license": "MIT",
|
|
@@ -48,11 +48,6 @@
|
|
|
48
48
|
"eslint --fix"
|
|
49
49
|
]
|
|
50
50
|
},
|
|
51
|
-
"socket": {
|
|
52
|
-
"ignore": {
|
|
53
|
-
"installScripts": true
|
|
54
|
-
}
|
|
55
|
-
},
|
|
56
51
|
"os": [
|
|
57
52
|
"darwin"
|
|
58
53
|
],
|