macos-vision 1.0.3 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ ## [1.2.0](https://github.com/woladi/macos-vision/compare/v1.1.0...v1.2.0) (2026-04-09)
4
+
5
+ ### Features
6
+
7
+ * replace sips with PDFKit-based pdf-helper binary for PDF rasterization ([4a223e2](https://github.com/woladi/macos-vision/commit/4a223e2de79571794d866452fd5e87b84590ff0d))
8
+
9
+ ## [1.1.0](https://github.com/woladi/macos-vision/compare/v1.0.3...v1.1.0) (2026-04-09)
10
+
11
+ ### Features
12
+
13
+ * add PDF support via sips rasterization ([a48bf17](https://github.com/woladi/macos-vision/commit/a48bf17579a6df11aed6eadbde4fa5041ccaa981))
14
+
3
15
  ## [1.0.3](https://github.com/woladi/macos-vision/compare/v1.0.2...v1.0.3) (2026-04-08)
4
16
 
5
17
  ### Reverts
package/bin/pdf-helper ADDED
Binary file
package/dist/index.d.ts CHANGED
@@ -1,3 +1,24 @@
1
+ export interface PdfPage {
2
+ /** 0-based page index */
3
+ page: number;
4
+ /** Absolute path to the rasterized PNG file */
5
+ path: string;
6
+ }
7
+ export interface PdfRasterizeResult {
8
+ /** Pages in document order */
9
+ pages: PdfPage[];
10
+ /** Directory containing all rasterized PNGs */
11
+ cacheDir: string;
12
+ }
13
+ /**
14
+ * Rasterizes a PDF to 300 DPI PNG files using the native `pdf-helper` binary
15
+ * (PDFKit-based). Files are saved persistently to `~/.cache/macos-vision/`
16
+ * so they can be reused by downstream tools — **caller is responsible for cleanup**.
17
+ *
18
+ * @param pdfPath - Absolute or relative path to the PDF file.
19
+ * @returns An object with `pages` (sorted array of `{page, path}`) and `cacheDir` (an empty string when the PDF yields no pages).
20
+ */
21
+ export declare function rasterizePdf(pdfPath: string): Promise<PdfRasterizeResult>;
1
22
  export interface VisionBlock {
2
23
  /** Recognized text */
3
24
  text: string;
@@ -11,6 +32,8 @@ export interface VisionBlock {
11
32
  height: number;
12
33
  /** OCR transcription confidence, 0–1 */
13
34
  confidence: number;
35
+ /** 0-based page index. Present only when the source was a PDF. Absent for images. */
36
+ page?: number;
14
37
  }
15
38
  export interface OcrOptions {
16
39
  /** Return plain text (default) or structured blocks with coordinates */
package/dist/index.js CHANGED
@@ -1,20 +1,85 @@
1
1
  import { execFile } from 'child_process';
2
2
  import { promisify } from 'util';
3
- import { resolve, dirname } from 'path';
3
+ import { resolve, dirname, extname, dirname as pathDirname } from 'path';
4
4
  import { fileURLToPath } from 'url';
5
+ import { open } from 'fs/promises';
5
6
  const execFileAsync = promisify(execFile);
6
7
  const __dirname = dirname(fileURLToPath(import.meta.url));
7
8
  const BIN_PATH = resolve(__dirname, '../bin/vision-helper');
9
+ const PDF_BIN_PATH = resolve(__dirname, '../bin/pdf-helper');
8
10
  const BINARY_TIMEOUT_MS = 30_000;
11
+ const PDF_RASTERIZE_TIMEOUT_MS = 120_000;
9
12
  async function run(flag, imagePath) {
10
13
  const { stdout } = await execFileAsync(BIN_PATH, [flag, resolve(imagePath)], {
11
14
  timeout: BINARY_TIMEOUT_MS,
12
15
  });
13
16
  return stdout;
14
17
  }
18
+ // ─── PDF helpers ─────────────────────────────────────────────────────────────
19
+ /**
20
+ * Returns true if the file at `filePath` is a PDF.
21
+ * Uses extension as a fast path; falls back to magic bytes (`%PDF`) for
22
+ * files whose extension does not match their actual content.
23
+ */
24
+ async function isPdf(filePath) {
25
+ if (extname(filePath).toLowerCase() === '.pdf')
26
+ return true;
27
+ let fh;
28
+ try {
29
+ fh = await open(filePath, 'r');
30
+ const buf = Buffer.alloc(4);
31
+ await fh.read(buf, 0, 4, 0);
32
+ return buf[0] === 0x25 && buf[1] === 0x50 && buf[2] === 0x44 && buf[3] === 0x46;
33
+ }
34
+ finally {
35
+ await fh?.close();
36
+ }
37
+ }
38
+ /**
39
+ * Rasterizes a PDF to 300 DPI PNG files using the native `pdf-helper` binary
40
+ * (PDFKit-based). Files are saved persistently to `~/.cache/macos-vision/`
41
+ * so they can be reused by downstream tools — **caller is responsible for cleanup**.
42
+ *
43
+ * @param pdfPath - Absolute or relative path to the PDF file.
44
+ * @returns An object with `pages` (sorted array of `{page, path}`) and `cacheDir` (an empty string when the PDF yields no pages).
45
+ */
46
+ export async function rasterizePdf(pdfPath) {
47
+ const absPath = resolve(pdfPath);
48
+ const { stdout } = await execFileAsync(PDF_BIN_PATH, [absPath], {
49
+ timeout: PDF_RASTERIZE_TIMEOUT_MS,
50
+ });
51
+ const pages = JSON.parse(stdout);
52
+ const cacheDir = pages.length > 0 ? pathDirname(pages[0].path) : '';
53
+ return { pages, cacheDir };
54
+ }
55
+ /**
56
+ * Internal PDF OCR pipeline: rasterize via pdf-helper → OCR each page → merge.
57
+ * PNG files are NOT cleaned up — they persist in ~/.cache/macos-vision/.
58
+ */
59
+ async function ocrPdf(pdfPath, format) {
60
+ const { pages } = await rasterizePdf(pdfPath);
61
+ if (format === 'blocks') {
62
+ const all = [];
63
+ for (const { page, path: pagePath } of pages) {
64
+ const blocks = (await ocr(pagePath, { format: 'blocks' }));
65
+ all.push(...blocks.map((b) => ({ ...b, page })));
66
+ }
67
+ return all;
68
+ }
69
+ const texts = [];
70
+ for (const { path: pagePath } of pages) {
71
+ texts.push((await ocr(pagePath)));
72
+ }
73
+ return texts.join('\n\n--- Page Break ---\n\n');
74
+ }
15
75
  export async function ocr(imagePath, options = {}) {
16
76
  const absPath = resolve(imagePath);
17
77
  const { format = 'text' } = options;
78
+ // ── PDF fast-path: rasterize via pdf-helper, then OCR each page ──────────
79
+ if (await isPdf(absPath)) {
80
+ return ocrPdf(absPath, format);
81
+ }
82
+ // ── Existing image path (unchanged) ──────────────────────────────────────
18
83
  if (format === 'blocks') {
19
84
  const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
20
85
  timeout: BINARY_TIMEOUT_MS,
package/dist/layout.d.ts CHANGED
@@ -85,6 +85,14 @@ export declare function sortBlocksByReadingOrder(blocks: LayoutBlock[]): LayoutB
85
85
  * (`paragraphId`) using simple bounding-box heuristics. All other block types
86
86
  * are placed into the sorted sequence by their top-left coordinate.
87
87
  *
88
+ * **Multi-page PDFs**: `VisionBlock` items from PDF OCR carry an optional `page` field (0-based).
89
+ * Because all coordinates are page-local (0–1 relative to each page), mixing blocks from
90
+ * different pages produces meaningless geometry. Pre-filter by page before calling inferLayout:
91
+ * ```ts
92
+ * const pageBlocks = pdfBlocks.filter(b => b.page === 0);
93
+ * const layout = inferLayout({ textBlocks: pageBlocks });
94
+ * ```
95
+ *
88
96
  * @example
89
97
  * ```ts
90
98
  * const blocks = await ocr('page.png', { format: 'blocks' });
package/dist/layout.js CHANGED
@@ -114,6 +114,14 @@ export function sortBlocksByReadingOrder(blocks) {
114
114
  * (`paragraphId`) using simple bounding-box heuristics. All other block types
115
115
  * are placed into the sorted sequence by their top-left coordinate.
116
116
  *
117
+ * **Multi-page PDFs**: `VisionBlock` items from PDF OCR carry an optional `page` field (0-based).
118
+ * Because all coordinates are page-local (0–1 relative to each page), mixing blocks from
119
+ * different pages produces meaningless geometry. Pre-filter by page before calling inferLayout:
120
+ * ```ts
121
+ * const pageBlocks = pdfBlocks.filter(b => b.page === 0);
122
+ * const layout = inferLayout({ textBlocks: pageBlocks });
123
+ * ```
124
+ *
117
125
  * @example
118
126
  * ```ts
119
127
  * const blocks = await ocr('page.png', { format: 'blocks' });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "macos-vision",
3
- "version": "1.0.3",
3
+ "version": "1.2.0",
4
4
  "description": "Apple Vision OCR & image analysis for Node.js — native, fast, offline, no API keys",
5
5
  "author": "Adrian Wolczuk",
6
6
  "license": "MIT",
@@ -6,21 +6,36 @@ import path from 'path';
6
6
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
7
7
  const root = path.resolve(__dirname, '..');
8
8
  const binDir = path.join(root, 'bin');
9
- const binPath = path.join(binDir, 'vision-helper');
10
- const swiftSrc = path.join(root, 'src', 'native', 'vision-helper.swift');
11
9
 
12
- if (existsSync(binPath)) {
10
+ const binaries = [
11
+ {
12
+ src: path.join(root, 'src', 'native', 'vision-helper.swift'),
13
+ out: path.join(binDir, 'vision-helper'),
14
+ name: 'vision-helper',
15
+ },
16
+ {
17
+ src: path.join(root, 'src', 'native', 'pdf-helper.swift'),
18
+ out: path.join(binDir, 'pdf-helper'),
19
+ name: 'pdf-helper',
20
+ },
21
+ ];
22
+
23
+ const allExist = binaries.every(({ out }) => existsSync(out));
24
+ if (allExist) {
13
25
  process.exit(0);
14
26
  }
15
27
 
16
28
  mkdirSync(binDir, { recursive: true });
17
29
 
18
- try {
19
- execSync(`swiftc -O "${swiftSrc}" -o "${binPath}"`, { stdio: 'inherit' });
20
- console.log('✅ macos-vision: native binary compiled successfully');
21
- } catch {
22
- console.error('❌ macos-vision: Swift compilation failed.');
23
- console.error(' Make sure Xcode Command Line Tools are installed:');
24
- console.error(' xcode-select --install');
25
- process.exit(1);
30
+ for (const { src, out, name } of binaries) {
31
+ if (existsSync(out)) continue;
32
+ try {
33
+ execSync(`swiftc -O "${src}" -o "${out}"`, { stdio: 'inherit' });
34
+ console.log(`✅ macos-vision: ${name} compiled successfully`);
35
+ } catch {
36
+ console.error(`❌ macos-vision: ${name} compilation failed.`);
37
+ console.error(' Make sure Xcode Command Line Tools are installed:');
38
+ console.error(' xcode-select --install');
39
+ process.exit(1);
40
+ }
26
41
  }