macos-vision 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,21 +1,21 @@
1
1
  import { execFile } from 'child_process';
2
2
  import { promisify } from 'util';
3
- import { resolve, dirname, extname, join } from 'path';
3
+ import { resolve, dirname, extname, dirname as pathDirname } from 'path';
4
4
  import { fileURLToPath } from 'url';
5
- import { tmpdir } from 'os';
6
- import { open, mkdir, readdir, rm } from 'fs/promises';
5
+ import { open } from 'fs/promises';
7
6
  const execFileAsync = promisify(execFile);
8
7
  const __dirname = dirname(fileURLToPath(import.meta.url));
9
8
  const BIN_PATH = resolve(__dirname, '../bin/vision-helper');
9
+ const PDF_BIN_PATH = resolve(__dirname, '../bin/pdf-helper');
10
10
  const BINARY_TIMEOUT_MS = 30_000;
11
- const SIPS_TIMEOUT_MS = 60_000;
11
+ const PDF_RASTERIZE_TIMEOUT_MS = 120_000;
12
12
  async function run(flag, imagePath) {
13
13
  const { stdout } = await execFileAsync(BIN_PATH, [flag, resolve(imagePath)], {
14
14
  timeout: BINARY_TIMEOUT_MS,
15
15
  });
16
16
  return stdout;
17
17
  }
18
- // ─── PDF helpers ─────────────────────────────────────────────────────────────
18
+ // ─── PDF helpers ─────────────────────────────────────────────────────
19
19
  /**
20
20
  * Returns true if the file at `filePath` is a PDF.
21
21
  * Uses extension as a fast path; falls back to magic bytes (`%PDF`) for
@@ -36,58 +36,50 @@ async function isPdf(filePath) {
36
36
  }
37
37
  }
38
38
  /**
39
- * Rasterizes a PDF to PNG files in `outDir` using macOS `sips`.
40
- * Returns sorted list of absolute PNG paths (order = page order).
39
+ * Rasterizes a PDF to 300 DPI PNG files using the native `pdf-helper` binary
40
+ * (PDFKit-based). Files are saved persistently to `~/.cache/macos-vision/`
41
+ * so they can be reused by downstream tools — **caller is responsible for cleanup**.
41
42
  *
42
- * sips names single-page output `{basename}.png` and multi-page output
43
- * `{basename}-1.png`, `{basename}-2.png`, etc. The numeric sort handles both.
43
+ * @param pdfPath - Absolute or relative path to the PDF file.
44
+ * @returns An object with `pages` (sorted array of `{page, path}`) and `cacheDir`.
44
45
  */
45
- async function rasterizePdf(pdfPath, outDir) {
46
- await execFileAsync('sips', ['-s', 'format', 'png', '--resampleHeight', '2000', pdfPath, '--out', outDir], { timeout: SIPS_TIMEOUT_MS });
47
- const entries = await readdir(outDir);
48
- const pngs = entries.filter((n) => n.toLowerCase().endsWith('.png'));
49
- pngs.sort((a, b) => {
50
- const numA = parseInt(a.match(/-(\d+)\.png$/i)?.[1] ?? '0', 10);
51
- const numB = parseInt(b.match(/-(\d+)\.png$/i)?.[1] ?? '0', 10);
52
- return numA - numB;
46
+ export async function rasterizePdf(pdfPath) {
47
+ const absPath = resolve(pdfPath);
48
+ const { stdout } = await execFileAsync(PDF_BIN_PATH, [absPath], {
49
+ timeout: PDF_RASTERIZE_TIMEOUT_MS,
53
50
  });
54
- return pngs.map((n) => join(outDir, n));
51
+ const pages = JSON.parse(stdout);
52
+ const cacheDir = pages.length > 0 ? pathDirname(pages[0].path) : '';
53
+ return { pages, cacheDir };
55
54
  }
56
55
  /**
57
- * Full PDF OCR pipeline: rasterize per-page OCR → merge results.
58
- * Temporary PNG files are always cleaned up in the `finally` block.
56
+ * Internal PDF OCR pipeline: rasterize via pdf-helper → OCR each page → merge.
57
+ * PNG files are NOT cleaned up — they persist in ~/.cache/macos-vision/.
59
58
  */
60
59
  async function ocrPdf(pdfPath, format) {
61
- const outDir = join(tmpdir(), `macos-vision-${globalThis.crypto.randomUUID()}`);
62
- await mkdir(outDir, { recursive: true });
63
- try {
64
- const pages = await rasterizePdf(pdfPath, outDir);
65
- if (format === 'blocks') {
66
- const all = [];
67
- for (let i = 0; i < pages.length; i++) {
68
- const blocks = (await ocr(pages[i], { format: 'blocks' }));
69
- all.push(...blocks.map((b) => ({ ...b, page: i })));
70
- }
71
- return all;
72
- }
73
- const texts = [];
74
- for (let i = 0; i < pages.length; i++) {
75
- texts.push((await ocr(pages[i])));
60
+ const { pages } = await rasterizePdf(pdfPath);
61
+ if (format === 'blocks') {
62
+ const all = [];
63
+ for (const { page, path: pagePath } of pages) {
64
+ const blocks = (await ocr(pagePath, { format: 'blocks' }));
65
+ all.push(...blocks.map((b) => ({ ...b, page })));
76
66
  }
77
- return texts.join('\n\n--- Page Break ---\n\n');
67
+ return all;
78
68
  }
79
- finally {
80
- await rm(outDir, { recursive: true, force: true });
69
+ const texts = [];
70
+ for (const { path: pagePath } of pages) {
71
+ texts.push((await ocr(pagePath)));
81
72
  }
73
+ return texts.join('\n\n--- Page Break ---\n\n');
82
74
  }
83
75
  export async function ocr(imagePath, options = {}) {
84
76
  const absPath = resolve(imagePath);
85
77
  const { format = 'text' } = options;
86
- // ── PDF fast-path: rasterize via sips, then OCR each page ────────────────
78
+ // ── PDF fast-path: rasterize via pdf-helper, then OCR each page ──────
87
79
  if (await isPdf(absPath)) {
88
80
  return ocrPdf(absPath, format);
89
81
  }
90
- // ── Existing image path (unchanged) ──────────────────────────────────────
82
+ // ── Existing image path (unchanged) ─────────────────────────────────
91
83
  if (format === 'blocks') {
92
84
  const { stdout } = await execFileAsync(BIN_PATH, ['--json', absPath], {
93
85
  timeout: BINARY_TIMEOUT_MS,
@@ -136,3 +128,5 @@ export async function classify(imagePath) {
136
128
  return raw;
137
129
  }
138
130
  export { inferLayout, sortBlocksByReadingOrder } from './layout.js';
131
+ // ─── Markdown pipeline (VisionScribe) ──────────────────────────────────────────
132
+ export { VisionScribe, OllamaUnavailableError } from './markdown/index.js';
@@ -0,0 +1,11 @@
1
+ import type { ParagraphGroup } from './prompt.js';
2
+ export declare function estimateTokens(text: string): number;
3
+ /**
4
+ * Split an array of paragraphs into chunks where each chunk's estimated prompt
5
+ * token count stays within `chunkSizeTokens`. Paragraph boundaries are never
6
+ * split — chunks always break between `ParagraphGroup` objects.
7
+ *
8
+ * A paragraph whose estimated token count exceeds the budget on its own is
9
+ * emitted as a singleton chunk with a warning.
10
+ */
11
+ export declare function chunkParagraphs(paragraphs: ParagraphGroup[], chunkSizeTokens: number): ParagraphGroup[][];
@@ -0,0 +1,39 @@
1
+ import { buildPrompt } from './prompt.js';
2
+ export function estimateTokens(text) {
3
+ return Math.ceil(text.length / 4);
4
+ }
5
+ /**
6
+ * Split an array of paragraphs into chunks where each chunk's estimated prompt
7
+ * token count stays within `chunkSizeTokens`. Paragraph boundaries are never
8
+ * split — chunks always break between `ParagraphGroup` objects.
9
+ *
10
+ * A paragraph whose estimated token count exceeds the budget on its own is
11
+ * emitted as a singleton chunk with a warning.
12
+ */
13
+ export function chunkParagraphs(paragraphs, chunkSizeTokens) {
14
+ const result = [];
15
+ let currentBatch = [];
16
+ for (const p of paragraphs) {
17
+ const candidate = [...currentBatch, p];
18
+ const tokens = estimateTokens(buildPrompt(candidate));
19
+ if (currentBatch.length === 0) {
20
+ if (tokens > chunkSizeTokens) {
21
+ console.warn(`[macos-vision] Paragraph ${p.paragraphId} (page ${p.page}) ` +
22
+ `exceeds chunk budget (~${tokens} est. tokens > ${chunkSizeTokens}). ` +
23
+ `Processing as standalone chunk.`);
24
+ }
25
+ currentBatch = [p];
26
+ }
27
+ else if (tokens <= chunkSizeTokens) {
28
+ currentBatch = candidate;
29
+ }
30
+ else {
31
+ result.push(currentBatch);
32
+ currentBatch = [p];
33
+ }
34
+ }
35
+ if (currentBatch.length > 0) {
36
+ result.push(currentBatch);
37
+ }
38
+ return result;
39
+ }
@@ -0,0 +1,61 @@
1
+ export { OllamaUnavailableError } from './ollama.js';
2
+ export type { ParagraphGroup } from './prompt.js';
3
+ export interface VisionScribeOptions {
4
+ /**
5
+ * Ollama model name.
6
+ * @default 'mistral-nemo'
7
+ */
8
+ model?: string;
9
+ /**
10
+ * Base URL of the Ollama server.
11
+ * @default 'http://localhost:11434'
12
+ */
13
+ ollamaUrl?: string;
14
+ /**
15
+ * Skip the Ollama reachability check before each call.
16
+ * Useful in batch/eval contexts where you ping once upfront.
17
+ * @default false
18
+ */
19
+ skipPing?: boolean;
20
+ /**
21
+ * Maximum estimated output tokens per LLM chunk.
22
+ * Paragraphs are batched so that no single generate() call is expected
23
+ * to produce more than this many tokens. Lower values mean more (faster)
24
+ * chunks; higher values risk hitting the model's output token limit.
25
+ * @default 1800
26
+ */
27
+ chunkSizeTokens?: number;
28
+ }
29
+ /**
30
+ * Converts an image or PDF to structured Markdown using a four-stage pipeline:
31
+ *
32
+ * 1. **Apple Vision OCR** — extracts raw text blocks with bounding-box coordinates.
33
+ * PDFs are automatically rasterized page-by-page.
34
+ * 2. **Layout inference** — groups blocks by `paragraphId` per page using spatial
35
+ * heuristics (each page processed independently to avoid coordinate mixing).
36
+ * 3. **Chunking** — paragraphs are batched to stay within the LLM output token budget.
37
+ * 4. **Local LLM (Ollama)** — formats each chunk into clean Markdown without
38
+ * hallucinating new content.
39
+ *
40
+ * @example
41
+ * ```ts
42
+ * const scribe = new VisionScribe({ model: 'mistral-nemo' });
43
+ * const markdown = await scribe.toMarkdown('invoice.png');
44
+ * const mdFromPdf = await scribe.toMarkdown('report.pdf');
45
+ * ```
46
+ */
47
+ export declare class VisionScribe {
48
+ private readonly model;
49
+ private readonly ollamaUrl;
50
+ private readonly skipPing;
51
+ private readonly chunkSizeTokens;
52
+ constructor(options?: VisionScribeOptions);
53
+ /**
54
+ * Convert an image or PDF file to Markdown.
55
+ *
56
+ * @param imagePath Absolute or relative path to the image or PDF.
57
+ * @returns Markdown string. Empty string if no text was detected.
58
+ * @throws {OllamaUnavailableError} If the Ollama server cannot be reached.
59
+ */
60
+ toMarkdown(imagePath: string): Promise<string>;
61
+ }
@@ -0,0 +1,92 @@
1
+ import { ocr, inferLayout, sortBlocksByReadingOrder } from '../index.js';
2
+ import { ping, chat } from './ollama.js';
3
+ import { groupByParagraph, buildUserContent, SYSTEM_PROMPT } from './prompt.js';
4
+ import { chunkParagraphs } from './chunker.js';
5
+ export { OllamaUnavailableError } from './ollama.js';
6
+ /**
7
+ * Group raw OCR blocks by their page index.
8
+ * macos-vision attaches a `page` field (0-based) to blocks from PDFs.
9
+ * Single-image blocks have no `page` field and land in page 0.
10
+ *
11
+ * Coordinates in VisionBlock are always page-local (0–1), so blocks from
12
+ * different pages must NOT be passed together to inferLayout().
13
+ */
14
+ function groupBlocksByPage(blocks) {
15
+ const pages = new Map();
16
+ for (const block of blocks) {
17
+ const page = block.page ?? 0;
18
+ const existing = pages.get(page) ?? [];
19
+ existing.push(block);
20
+ pages.set(page, existing);
21
+ }
22
+ return pages;
23
+ }
24
+ /**
25
+ * Converts an image or PDF to structured Markdown using a four-stage pipeline:
26
+ *
27
+ * 1. **Apple Vision OCR** — extracts raw text blocks with bounding-box coordinates.
28
+ * PDFs are automatically rasterized page-by-page.
29
+ * 2. **Layout inference** — groups blocks by `paragraphId` per page using spatial
30
+ * heuristics (each page processed independently to avoid coordinate mixing).
31
+ * 3. **Chunking** — paragraphs are batched to stay within the LLM output token budget.
32
+ * 4. **Local LLM (Ollama)** — formats each chunk into clean Markdown without
33
+ * hallucinating new content.
34
+ *
35
+ * @example
36
+ * ```ts
37
+ * const scribe = new VisionScribe({ model: 'mistral-nemo' });
38
+ * const markdown = await scribe.toMarkdown('invoice.png');
39
+ * const mdFromPdf = await scribe.toMarkdown('report.pdf');
40
+ * ```
41
+ */
42
+ export class VisionScribe {
43
+ model;
44
+ ollamaUrl;
45
+ skipPing;
46
+ chunkSizeTokens;
47
+ constructor(options = {}) {
48
+ this.model = options.model ?? 'mistral-nemo';
49
+ this.ollamaUrl = options.ollamaUrl ?? 'http://localhost:11434';
50
+ this.skipPing = options.skipPing ?? false;
51
+ this.chunkSizeTokens = options.chunkSizeTokens ?? 1800;
52
+ }
53
+ /**
54
+ * Convert an image or PDF file to Markdown.
55
+ *
56
+ * @param imagePath Absolute or relative path to the image or PDF.
57
+ * @returns Markdown string. Empty string if no text was detected.
58
+ * @throws {OllamaUnavailableError} If the Ollama server cannot be reached.
59
+ */
60
+ async toMarkdown(imagePath) {
61
+ // 1. Ensure Ollama is reachable before doing expensive OCR work.
62
+ if (!this.skipPing)
63
+ await ping(this.ollamaUrl);
64
+ // 2. Extract raw OCR blocks via Apple Vision.
65
+ // For PDFs, macos-vision rasterizes each page and adds a `page` field.
66
+ const rawBlocks = await ocr(imagePath, { format: 'blocks' });
67
+ // 3. Split blocks by page — inferLayout() requires page-local coordinates.
68
+ const pageMap = groupBlocksByPage(rawBlocks);
69
+ // 4. Per page: infer layout → sort → group into paragraphs.
70
+ const allParagraphs = [];
71
+ for (const [pageIndex, pageBlocks] of [...pageMap.entries()].sort(([a], [b]) => a - b)) {
72
+ const layoutBlocks = inferLayout({ textBlocks: pageBlocks });
73
+ const sorted = sortBlocksByReadingOrder(layoutBlocks);
74
+ const paragraphs = groupByParagraph(sorted, pageIndex);
75
+ allParagraphs.push(...paragraphs);
76
+ }
77
+ if (allParagraphs.length === 0) {
78
+ return '';
79
+ }
80
+ // 5. Split paragraphs into chunks that fit within the output token budget.
81
+ const chunks = chunkParagraphs(allParagraphs, this.chunkSizeTokens);
82
+ // 6. Send each chunk to the LLM sequentially and join the results.
83
+ // System prompt goes as role:"system", OCR text as role:"user" — this
84
+ // prevents the model from treating instructions as content to summarise.
85
+ const parts = [];
86
+ for (const chunk of chunks) {
87
+ const part = await chat({ baseUrl: this.ollamaUrl, model: this.model }, SYSTEM_PROMPT, buildUserContent(chunk));
88
+ parts.push(part);
89
+ }
90
+ return parts.join('\n\n');
91
+ }
92
+ }
@@ -0,0 +1,21 @@
1
+ export interface OllamaOptions {
2
+ /** Base URL of the Ollama server. Default: `http://localhost:11434` */
3
+ baseUrl: string;
4
+ /** Model name to use for generation. Default: `mistral-nemo` */
5
+ model: string;
6
+ }
7
+ /**
8
+ * Verify that the Ollama server is reachable.
9
+ * Throws {@link OllamaUnavailableError} if the server cannot be contacted.
10
+ */
11
+ export declare function ping(baseUrl: string): Promise<void>;
12
+ /**
13
+ * Send a chat request to the Ollama /api/chat endpoint.
14
+ * Separating system and user roles significantly reduces hallucination compared
15
+ * to /api/generate where both are concatenated into a single completion string.
16
+ * Uses `stream: false` so the full response arrives in one shot.
17
+ */
18
+ export declare function chat(opts: OllamaOptions, systemPrompt: string, userContent: string): Promise<string>;
19
+ export declare class OllamaUnavailableError extends Error {
20
+ constructor(url: string);
21
+ }
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Verify that the Ollama server is reachable.
3
+ * Throws {@link OllamaUnavailableError} if the server cannot be contacted.
4
+ */
5
+ export async function ping(baseUrl) {
6
+ try {
7
+ await fetch(`${baseUrl}/api/tags`, { signal: AbortSignal.timeout(3_000) });
8
+ }
9
+ catch {
10
+ throw new OllamaUnavailableError(baseUrl);
11
+ }
12
+ }
13
+ /**
14
+ * Send a chat request to the Ollama /api/chat endpoint.
15
+ * Separating system and user roles significantly reduces hallucination compared
16
+ * to /api/generate where both are concatenated into a single completion string.
17
+ * Uses `stream: false` so the full response arrives in one shot.
18
+ */
19
+ export async function chat(opts, systemPrompt, userContent) {
20
+ const res = await fetch(`${opts.baseUrl}/api/chat`, {
21
+ method: 'POST',
22
+ headers: { 'Content-Type': 'application/json' },
23
+ body: JSON.stringify({
24
+ model: opts.model,
25
+ messages: [
26
+ { role: 'system', content: systemPrompt },
27
+ { role: 'user', content: userContent },
28
+ ],
29
+ stream: false,
30
+ options: {
31
+ temperature: 0, // deterministic — no creative token selection
32
+ top_p: 1, // full vocabulary considered; determinism comes from temperature=0
33
+ num_predict: -1, // no output truncation
34
+ },
35
+ }),
36
+ signal: AbortSignal.timeout(600_000),
37
+ });
38
+ if (!res.ok) {
39
+ throw new Error(`Ollama request failed: ${res.status} ${res.statusText}`);
40
+ }
41
+ const data = (await res.json());
42
+ return data.message.content.trim();
43
+ }
44
+ export class OllamaUnavailableError extends Error {
45
+ constructor(url) {
46
+ super(`Ollama is not reachable at ${url}. ` +
47
+ `Make sure Ollama is running (e.g. \`ollama serve\`) and the URL is correct.`);
48
+ this.name = 'OllamaUnavailableError';
49
+ }
50
+ }
@@ -0,0 +1,35 @@
1
+ import type { LayoutBlock } from '../index.js';
2
+ /**
3
+ * A single paragraph: all TextBlocks sharing the same paragraphId,
4
+ * in reading order (top-to-bottom, left-to-right within each line).
5
+ */
6
+ export interface ParagraphGroup {
7
+ paragraphId: number;
8
+ /** Average y-coordinate of the first line — used as a spatial hint. */
9
+ y: number;
10
+ lines: string[];
11
+ /** Zero-based page index (always 0 for single images). */
12
+ page: number;
13
+ }
14
+ /**
15
+ * Group sorted layout blocks by paragraphId into ParagraphGroup objects.
16
+ * Non-text blocks (faces, barcodes, etc.) are ignored.
17
+ */
18
+ export declare function groupByParagraph(blocks: LayoutBlock[], pageIndex?: number): ParagraphGroup[];
19
+ /**
20
+ * The system prompt sent as `role: "system"` in every chat request.
21
+ * Kept separate from user content so the model treats it as hard constraints,
22
+ * not as text to be summarised or analysed.
23
+ */
24
+ export declare const SYSTEM_PROMPT = "ACT AS A HIGH-FIDELITY DOCUMENT PARSER. Your only goal is to reconstruct the provided OCR data into a structured Markdown document. NEVER skip text. NEVER summarize. Content must be 100% identical to the source.\n\nDO NOT SUMMARIZE.\nTranscribe every single word from the provided OCR data.\nMaintain 1:1 content fidelity. If the source has 5 paragraphs, the output must have 5 paragraphs.\n\nSTRICT OUTPUT: Output ONLY the Markdown representation. No preamble, no \"Summary of key events\", no \"Here is the result\".\n\nFORMATTING RULES:\n- Add # / ## / ### before lines that are clearly headings or titles\n- Add - before items that are clearly list entries\n- Join lines within the same paragraph into flowing prose\n- Preserve blank lines between paragraphs\n- Do NOT wrap output in code fences";
25
+ /**
26
+ * Build the user-facing content block from a list of paragraphs.
27
+ * This is the OCR text that the model will format — no instructions included.
28
+ * Sent as `role: "user"` in the chat request.
29
+ */
30
+ export declare function buildUserContent(paragraphs: ParagraphGroup[]): string;
31
+ /**
32
+ * Build the combined string used for token estimation in the chunker.
33
+ * Mirrors what will be sent to the model (system + user content).
34
+ */
35
+ export declare function buildPrompt(paragraphs: ParagraphGroup[]): string;
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Group sorted layout blocks by paragraphId into ParagraphGroup objects.
3
+ * Non-text blocks (faces, barcodes, etc.) are ignored.
4
+ */
5
+ export function groupByParagraph(blocks, pageIndex = 0) {
6
+ const map = new Map();
7
+ for (const block of blocks) {
8
+ if (block.kind !== 'text')
9
+ continue;
10
+ if (!map.has(block.paragraphId)) {
11
+ map.set(block.paragraphId, { y: block.y, lines: new Map() });
12
+ }
13
+ const para = map.get(block.paragraphId);
14
+ const existing = para.lines.get(block.lineId) ?? [];
15
+ existing.push(block.text);
16
+ para.lines.set(block.lineId, existing);
17
+ }
18
+ const groups = [];
19
+ for (const [paragraphId, { y, lines }] of map) {
20
+ // Join tokens within each line, then collect lines in order
21
+ const lineStrings = [...lines.entries()]
22
+ .sort(([a], [b]) => a - b)
23
+ .map(([, tokens]) => tokens.join(' '));
24
+ groups.push({ paragraphId, y, lines: lineStrings, page: pageIndex });
25
+ }
26
+ return groups;
27
+ }
28
+ /**
29
+ * The system prompt sent as `role: "system"` in every chat request.
30
+ * Kept separate from user content so the model treats it as hard constraints,
31
+ * not as text to be summarised or analysed.
32
+ */
33
+ export const SYSTEM_PROMPT = `ACT AS A HIGH-FIDELITY DOCUMENT PARSER. \
34
+ Your only goal is to reconstruct the provided OCR data into a structured \
35
+ Markdown document. NEVER skip text. NEVER summarize. \
36
+ Content must be 100% identical to the source.
37
+
38
+ DO NOT SUMMARIZE.
39
+ Transcribe every single word from the provided OCR data.
40
+ Maintain 1:1 content fidelity. If the source has 5 paragraphs, the output must have 5 paragraphs.
41
+
42
+ STRICT OUTPUT: Output ONLY the Markdown representation. \
43
+ No preamble, no "Summary of key events", no "Here is the result".
44
+
45
+ FORMATTING RULES:
46
+ - Add # / ## / ### before lines that are clearly headings or titles
47
+ - Add - before items that are clearly list entries
48
+ - Join lines within the same paragraph into flowing prose
49
+ - Preserve blank lines between paragraphs
50
+ - Do NOT wrap output in code fences`;
51
+ /**
52
+ * Build the user-facing content block from a list of paragraphs.
53
+ * This is the OCR text that the model will format — no instructions included.
54
+ * Sent as `role: "user"` in the chat request.
55
+ */
56
+ export function buildUserContent(paragraphs) {
57
+ const pageNumbers = [...new Set(paragraphs.map(p => p.page))].sort((a, b) => a - b);
58
+ const multiPage = pageNumbers.length > 1;
59
+ const blocks = [];
60
+ for (const pageNum of pageNumbers) {
61
+ if (multiPage) {
62
+ blocks.push(`[Page ${pageNum + 1}]`);
63
+ }
64
+ const pageParagraphs = paragraphs.filter(p => p.page === pageNum);
65
+ for (const { paragraphId, y, lines } of pageParagraphs) {
66
+ const yHint = y.toFixed(2);
67
+ const header = `[Paragraph ${paragraphId}, y≈${yHint}]`;
68
+ blocks.push(`${header}\n${lines.join('\n')}`);
69
+ }
70
+ }
71
+ const task = 'Convert the OCR source below into Markdown. ' +
72
+ 'Reproduce EVERY word EXACTLY. Do not respond, explain, or ask questions.\n\n' +
73
+ '<ocr_source>';
74
+ return `${task}\n\n${blocks.join('\n\n')}\n</ocr_source>`;
75
+ }
76
+ /**
77
+ * Build the combined string used for token estimation in the chunker.
78
+ * Mirrors what will be sent to the model (system + user content).
79
+ */
80
+ export function buildPrompt(paragraphs) {
81
+ return `${SYSTEM_PROMPT}\n\n${buildUserContent(paragraphs)}`;
82
+ }
package/package.json CHANGED
@@ -1,15 +1,33 @@
1
1
  {
2
2
  "name": "macos-vision",
3
- "version": "1.1.0",
4
- "description": "Apple Vision OCR & image analysis for Node.js — native, fast, offline, no API keys",
3
+ "version": "1.3.0",
4
+ "description": "Apple Vision OCR + image/PDF analysis for Node.js, with optional Ollama-driven Markdown pipeline — native, fast, offline",
5
5
  "author": "Adrian Wolczuk",
6
6
  "license": "MIT",
7
7
  "type": "module",
8
8
  "main": "./dist/index.js",
9
9
  "types": "./dist/index.d.ts",
10
10
  "bin": {
11
- "macos-vision": "./dist/cli.js"
11
+ "macos-vision": "dist/cli.js"
12
12
  },
13
+ "exports": {
14
+ ".": {
15
+ "types": "./dist/index.d.ts",
16
+ "import": "./dist/index.js"
17
+ },
18
+ "./markdown": {
19
+ "types": "./dist/markdown/index.d.ts",
20
+ "import": "./dist/markdown/index.js"
21
+ }
22
+ },
23
+ "files": [
24
+ "dist",
25
+ "bin",
26
+ "scripts/build-native.js",
27
+ "src/native",
28
+ "README.md",
29
+ "LICENSE"
30
+ ],
13
31
  "repository": {
14
32
  "type": "git",
15
33
  "url": "git+https://github.com/woladi/macos-vision.git"
@@ -25,7 +43,8 @@
25
43
  "lint": "eslint src/**/*.ts",
26
44
  "format": "prettier --write src/**/*.ts",
27
45
  "release": "release-it",
28
- "release:beta": "release-it --preRelease=beta"
46
+ "release:beta": "release-it --preRelease=beta",
47
+ "eval:setup": "git clone https://github.com/opendataloader-project/opendataloader-bench eval/bench || echo 'bench already cloned'"
29
48
  },
30
49
  "keywords": [
31
50
  "ocr",
@@ -40,7 +59,13 @@
40
59
  "barcode",
41
60
  "qr-code",
42
61
  "document-detection",
43
- "image-classification"
62
+ "image-classification",
63
+ "markdown",
64
+ "image-to-markdown",
65
+ "pdf-to-markdown",
66
+ "ollama",
67
+ "llm",
68
+ "document-pipeline"
44
69
  ],
45
70
  "lint-staged": {
46
71
  "src/**/*.ts": [
@@ -6,21 +6,36 @@ import path from 'path';
6
6
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
7
7
  const root = path.resolve(__dirname, '..');
8
8
  const binDir = path.join(root, 'bin');
9
- const binPath = path.join(binDir, 'vision-helper');
10
- const swiftSrc = path.join(root, 'src', 'native', 'vision-helper.swift');
11
9
 
12
- if (existsSync(binPath)) {
10
+ const binaries = [
11
+ {
12
+ src: path.join(root, 'src', 'native', 'vision-helper.swift'),
13
+ out: path.join(binDir, 'vision-helper'),
14
+ name: 'vision-helper',
15
+ },
16
+ {
17
+ src: path.join(root, 'src', 'native', 'pdf-helper.swift'),
18
+ out: path.join(binDir, 'pdf-helper'),
19
+ name: 'pdf-helper',
20
+ },
21
+ ];
22
+
23
+ const allExist = binaries.every(({ out }) => existsSync(out));
24
+ if (allExist) {
13
25
  process.exit(0);
14
26
  }
15
27
 
16
28
  mkdirSync(binDir, { recursive: true });
17
29
 
18
- try {
19
- execSync(`swiftc -O "${swiftSrc}" -o "${binPath}"`, { stdio: 'inherit' });
20
- console.log('✅ macos-vision: native binary compiled successfully');
21
- } catch {
22
- console.error('❌ macos-vision: Swift compilation failed.');
23
- console.error(' Make sure Xcode Command Line Tools are installed:');
24
- console.error(' xcode-select --install');
25
- process.exit(1);
30
+ for (const { src, out, name } of binaries) {
31
+ if (existsSync(out)) continue;
32
+ try {
33
+ execSync(`swiftc -O "${src}" -o "${out}"`, { stdio: 'inherit' });
34
+ console.log(`✅ macos-vision: ${name} compiled successfully`);
35
+ } catch {
36
+ console.error(`❌ macos-vision: ${name} compilation failed.`);
37
+ console.error(' Make sure Xcode Command Line Tools are installed:');
38
+ console.error(' xcode-select --install');
39
+ process.exit(1);
40
+ }
26
41
  }