macos-vision 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +161 -103
- package/bin/pdf-helper +0 -0
- package/bin/vision-helper +0 -0
- package/dist/cli.js +131 -68
- package/dist/index.d.ts +2 -0
- package/dist/index.js +5 -3
- package/dist/markdown/chunker.d.ts +11 -0
- package/dist/markdown/chunker.js +39 -0
- package/dist/markdown/index.d.ts +61 -0
- package/dist/markdown/index.js +92 -0
- package/dist/markdown/ollama.d.ts +21 -0
- package/dist/markdown/ollama.js +50 -0
- package/dist/markdown/prompt.d.ts +35 -0
- package/dist/markdown/prompt.js +82 -0
- package/package.json +30 -5
- package/src/native/pdf-helper.swift +122 -0
- package/src/native/vision-helper.swift +241 -0
- package/.husky/commit-msg +0 -2
- package/.husky/pre-commit +0 -3
- package/.prettierignore +0 -4
- package/.prettierrc.json +0 -7
- package/.release-it.json +0 -20
- package/CHANGELOG.md +0 -44
- package/commitlint.config.js +0 -1
- package/debug.js +0 -37
- package/eslint.config.js +0 -21
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { buildPrompt } from './prompt.js';
|
|
2
|
+
/**
 * Rough token-count estimate for a string.
 * Uses the common ~4-characters-per-token heuristic so no tokenizer is needed.
 */
export function estimateTokens(text) {
    const CHARS_PER_TOKEN = 4;
    return Math.ceil(text.length / CHARS_PER_TOKEN);
}
|
|
5
|
+
/**
 * Split an array of paragraphs into chunks where each chunk's estimated prompt
 * token count stays within `chunkSizeTokens`. Paragraph boundaries are never
 * split — chunks always break between `ParagraphGroup` objects.
 *
 * A paragraph whose estimated token count exceeds the budget on its own is
 * emitted as a singleton chunk with a warning.
 */
export function chunkParagraphs(paragraphs, chunkSizeTokens) {
    const chunks = [];
    let batch = [];
    for (const paragraph of paragraphs) {
        // Estimate the full prompt size as if this paragraph joined the batch.
        const extended = batch.concat([paragraph]);
        const estimate = estimateTokens(buildPrompt(extended));
        if (batch.length === 0) {
            // A fresh batch always accepts its first paragraph, even when that
            // paragraph alone blows the budget — warn and carry on.
            if (estimate > chunkSizeTokens) {
                console.warn(`[macos-vision] Paragraph ${paragraph.paragraphId} (page ${paragraph.page}) ` +
                    `exceeds chunk budget (~${estimate} est. tokens > ${chunkSizeTokens}). ` +
                    `Processing as standalone chunk.`);
            }
            batch = [paragraph];
            continue;
        }
        if (estimate <= chunkSizeTokens) {
            batch = extended;
        }
        else {
            // Budget exceeded: seal the current batch and start a new one.
            chunks.push(batch);
            batch = [paragraph];
        }
    }
    if (batch.length > 0) {
        chunks.push(batch);
    }
    return chunks;
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
export { OllamaUnavailableError } from './ollama.js';
|
|
2
|
+
export type { ParagraphGroup } from './prompt.js';
|
|
3
|
+
/** Configuration for {@link VisionScribe}. Every field is optional. */
export interface VisionScribeOptions {
    /**
     * Ollama model name.
     * @default 'mistral-nemo'
     */
    model?: string;
    /**
     * Base URL of the Ollama server.
     * @default 'http://localhost:11434'
     */
    ollamaUrl?: string;
    /**
     * Skip the Ollama reachability check before each call.
     * Useful in batch/eval contexts where you ping once upfront.
     * @default false
     */
    skipPing?: boolean;
    /**
     * Maximum estimated output tokens per LLM chunk.
     * Paragraphs are batched so that no single generate() call is expected
     * to produce more than this many tokens. Lower values mean more (faster)
     * chunks; higher values risk hitting the model's output token limit.
     * @default 1800
     */
    chunkSizeTokens?: number;
}
|
|
29
|
+
/**
 * Converts an image or PDF to structured Markdown using a multi-stage pipeline:
 *
 * 1. **Apple Vision OCR** — extracts raw text blocks with bounding-box coordinates.
 *    PDFs are automatically rasterized page-by-page.
 * 2. **Layout inference** — groups blocks by `paragraphId` per page using spatial
 *    heuristics (each page processed independently to avoid coordinate mixing).
 * 3. **Chunking** — paragraphs are batched to stay within the LLM output token budget.
 * 4. **Local LLM (Ollama)** — formats each chunk into clean Markdown without
 *    hallucinating new content.
 *
 * @example
 * ```ts
 * const scribe = new VisionScribe({ model: 'mistral-nemo' });
 * const markdown = await scribe.toMarkdown('invoice.png');
 * const mdFromPdf = await scribe.toMarkdown('report.pdf');
 * ```
 */
export declare class VisionScribe {
    /** Ollama model name; defaults to 'mistral-nemo'. */
    private readonly model;
    /** Base URL of the Ollama server; defaults to http://localhost:11434. */
    private readonly ollamaUrl;
    /** When true, the per-call reachability ping is skipped. */
    private readonly skipPing;
    /** Output-token budget used when batching paragraphs; defaults to 1800. */
    private readonly chunkSizeTokens;
    constructor(options?: VisionScribeOptions);
    /**
     * Convert an image or PDF file to Markdown.
     *
     * @param imagePath Absolute or relative path to the image or PDF.
     * @returns Markdown string. Empty string if no text was detected.
     * @throws {OllamaUnavailableError} If the Ollama server cannot be reached.
     */
    toMarkdown(imagePath: string): Promise<string>;
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { ocr, inferLayout, sortBlocksByReadingOrder } from '../index.js';
|
|
2
|
+
import { ping, chat } from './ollama.js';
|
|
3
|
+
import { groupByParagraph, buildUserContent, SYSTEM_PROMPT } from './prompt.js';
|
|
4
|
+
import { chunkParagraphs } from './chunker.js';
|
|
5
|
+
export { OllamaUnavailableError } from './ollama.js';
|
|
6
|
+
/**
 * Group raw OCR blocks by their page index.
 * macos-vision attaches a `page` field (0-based) to blocks from PDFs.
 * Single-image blocks have no `page` field and land in page 0.
 *
 * Coordinates in VisionBlock are always page-local (0–1), so blocks from
 * different pages must NOT be passed together to inferLayout().
 */
function groupBlocksByPage(blocks) {
    const byPage = new Map();
    for (const block of blocks) {
        const pageIndex = block.page ?? 0;
        let bucket = byPage.get(pageIndex);
        if (bucket === undefined) {
            bucket = [];
            byPage.set(pageIndex, bucket);
        }
        bucket.push(block);
    }
    return byPage;
}
|
|
24
|
+
/**
 * Converts an image or PDF to structured Markdown using a multi-stage pipeline:
 *
 * 1. **Apple Vision OCR** — extracts raw text blocks with bounding-box coordinates.
 *    PDFs are automatically rasterized page-by-page.
 * 2. **Layout inference** — groups blocks by `paragraphId` per page using spatial
 *    heuristics (each page processed independently to avoid coordinate mixing).
 * 3. **Chunking** — paragraphs are batched to stay within the LLM output token budget.
 * 4. **Local LLM (Ollama)** — formats each chunk into clean Markdown without
 *    hallucinating new content.
 *
 * @example
 * ```ts
 * const scribe = new VisionScribe({ model: 'mistral-nemo' });
 * const markdown = await scribe.toMarkdown('invoice.png');
 * const mdFromPdf = await scribe.toMarkdown('report.pdf');
 * ```
 */
export class VisionScribe {
    model;
    ollamaUrl;
    skipPing;
    chunkSizeTokens;
    constructor(options = {}) {
        // `??` (not destructuring defaults) so an explicit null also falls
        // back to the documented default.
        this.model = options.model ?? 'mistral-nemo';
        this.ollamaUrl = options.ollamaUrl ?? 'http://localhost:11434';
        this.skipPing = options.skipPing ?? false;
        this.chunkSizeTokens = options.chunkSizeTokens ?? 1800;
    }
    /**
     * Convert an image or PDF file to Markdown.
     *
     * @param imagePath Absolute or relative path to the image or PDF.
     * @returns Markdown string. Empty string if no text was detected.
     * @throws {OllamaUnavailableError} If the Ollama server cannot be reached.
     */
    async toMarkdown(imagePath) {
        // Fail fast if the LLM backend is down — OCR is the expensive part.
        if (!this.skipPing) {
            await ping(this.ollamaUrl);
        }
        // Apple Vision OCR; PDF pages carry a 0-based `page` field.
        const blocks = await ocr(imagePath, { format: 'blocks' });
        // Coordinates are page-local, so layout inference must run per page,
        // in ascending page order.
        const orderedPages = [...groupBlocksByPage(blocks).entries()].sort(([a], [b]) => a - b);
        const paragraphs = [];
        for (const [pageIndex, pageBlocks] of orderedPages) {
            const withLayout = inferLayout({ textBlocks: pageBlocks });
            const ordered = sortBlocksByReadingOrder(withLayout);
            for (const paragraph of groupByParagraph(ordered, pageIndex)) {
                paragraphs.push(paragraph);
            }
        }
        if (paragraphs.length === 0) {
            return '';
        }
        // Batch paragraphs so each LLM call stays inside the output budget,
        // then format the batches sequentially. Instructions travel as
        // role:"system" and OCR text as role:"user" so the model never treats
        // the instructions as content to summarise.
        const batches = chunkParagraphs(paragraphs, this.chunkSizeTokens);
        const rendered = [];
        for (const batch of batches) {
            rendered.push(await chat({ baseUrl: this.ollamaUrl, model: this.model }, SYSTEM_PROMPT, buildUserContent(batch)));
        }
        return rendered.join('\n\n');
    }
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/** Connection settings passed to {@link chat}. */
export interface OllamaOptions {
    /** Base URL of the Ollama server. Default: `http://localhost:11434` */
    baseUrl: string;
    /** Model name to use for generation. Default: `mistral-nemo` */
    model: string;
}
|
|
7
|
+
/**
 * Verify that the Ollama server is reachable.
 * Throws {@link OllamaUnavailableError} if the server cannot be contacted.
 *
 * @param baseUrl Base URL of the Ollama server, e.g. `http://localhost:11434`.
 */
export declare function ping(baseUrl: string): Promise<void>;
/**
 * Send a chat request to the Ollama /api/chat endpoint.
 * Separating system and user roles significantly reduces hallucination compared
 * to /api/generate where both are concatenated into a single completion string.
 * Uses `stream: false` so the full response arrives in one shot.
 *
 * @param opts Server URL and model name.
 * @param systemPrompt Sent as `role: "system"` — the hard formatting constraints.
 * @param userContent Sent as `role: "user"` — the OCR text to format.
 * @returns The model's reply content, trimmed.
 */
export declare function chat(opts: OllamaOptions, systemPrompt: string, userContent: string): Promise<string>;
/** Error thrown when the Ollama server cannot be contacted at all. */
export declare class OllamaUnavailableError extends Error {
    constructor(url: string);
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
 * Verify that the Ollama server is reachable.
 * Throws {@link OllamaUnavailableError} if the server cannot be contacted.
 */
export async function ping(baseUrl) {
    const probeUrl = `${baseUrl}/api/tags`;
    try {
        // Any HTTP response counts as "reachable"; only a network-level
        // failure (or the 3 s timeout) means the server is down.
        await fetch(probeUrl, { signal: AbortSignal.timeout(3_000) });
    }
    catch {
        throw new OllamaUnavailableError(baseUrl);
    }
}
|
|
13
|
+
/**
 * Send a chat request to the Ollama /api/chat endpoint.
 * Separating system and user roles significantly reduces hallucination compared
 * to /api/generate where both are concatenated into a single completion string.
 * Uses `stream: false` so the full response arrives in one shot.
 *
 * @param opts Server base URL and model name.
 * @param systemPrompt Sent as `role: "system"`.
 * @param userContent Sent as `role: "user"`.
 * @returns The model's reply content, trimmed.
 * @throws {Error} If the HTTP request fails or the response payload is malformed.
 */
export async function chat(opts, systemPrompt, userContent) {
    const res = await fetch(`${opts.baseUrl}/api/chat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            model: opts.model,
            messages: [
                { role: 'system', content: systemPrompt },
                { role: 'user', content: userContent },
            ],
            stream: false,
            options: {
                temperature: 0, // deterministic — no creative token selection
                top_p: 1, // full vocabulary considered; determinism comes from temperature=0
                num_predict: -1, // no output truncation
            },
        }),
        signal: AbortSignal.timeout(600_000),
    });
    if (!res.ok) {
        throw new Error(`Ollama request failed: ${res.status} ${res.statusText}`);
    }
    const data = (await res.json());
    // Ollama can report failures in an `error` field of the JSON body;
    // surface that instead of crashing on the missing `message` below.
    if (data && typeof data.error === 'string') {
        throw new Error(`Ollama returned an error: ${data.error}`);
    }
    // Guard the response shape so a malformed payload raises a clear error
    // rather than an opaque "cannot read properties of undefined".
    if (typeof data?.message?.content !== 'string') {
        throw new Error('Ollama response missing message.content');
    }
    return data.message.content.trim();
}
|
|
44
|
+
/**
 * Raised when the Ollama server cannot be contacted at all
 * (connection refused, DNS failure, or ping timeout).
 */
export class OllamaUnavailableError extends Error {
    constructor(url) {
        const message = `Ollama is not reachable at ${url}. ` +
            `Make sure Ollama is running (e.g. \`ollama serve\`) and the URL is correct.`;
        super(message);
        this.name = 'OllamaUnavailableError';
    }
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import type { LayoutBlock } from '../index.js';
|
|
2
|
+
/**
|
|
3
|
+
* A single paragraph: all TextBlocks sharing the same paragraphId,
|
|
4
|
+
* in reading order (top-to-bottom, left-to-right within each line).
|
|
5
|
+
*/
|
|
6
|
+
export interface ParagraphGroup {
|
|
7
|
+
paragraphId: number;
|
|
8
|
+
/** Average y-coordinate of the first line — used as a spatial hint. */
|
|
9
|
+
y: number;
|
|
10
|
+
lines: string[];
|
|
11
|
+
/** Zero-based page index (always 0 for single images). */
|
|
12
|
+
page: number;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Group sorted layout blocks by paragraphId into ParagraphGroup objects.
|
|
16
|
+
* Non-text blocks (faces, barcodes, etc.) are ignored.
|
|
17
|
+
*/
|
|
18
|
+
export declare function groupByParagraph(blocks: LayoutBlock[], pageIndex?: number): ParagraphGroup[];
|
|
19
|
+
/**
|
|
20
|
+
* The system prompt sent as `role: "system"` in every chat request.
|
|
21
|
+
* Kept separate from user content so the model treats it as hard constraints,
|
|
22
|
+
* not as text to be summarised or analysed.
|
|
23
|
+
*/
|
|
24
|
+
export declare const SYSTEM_PROMPT = "ACT AS A HIGH-FIDELITY DOCUMENT PARSER. Your only goal is to reconstruct the provided OCR data into a structured Markdown document. NEVER skip text. NEVER summarize. Content must be 100% identical to the source.\n\nDO NOT SUMMARIZE.\nTranscribe every single word from the provided OCR data.\nMaintain 1:1 content fidelity. If the source has 5 paragraphs, the output must have 5 paragraphs.\n\nSTRICT OUTPUT: Output ONLY the Markdown representation. No preamble, no \"Summary of key events\", no \"Here is the result\".\n\nFORMATTING RULES:\n- Add # / ## / ### before lines that are clearly headings or titles\n- Add - before items that are clearly list entries\n- Join lines within the same paragraph into flowing prose\n- Preserve blank lines between paragraphs\n- Do NOT wrap output in code fences";
|
|
25
|
+
/**
|
|
26
|
+
* Build the user-facing content block from a list of paragraphs.
|
|
27
|
+
* This is the OCR text that the model will format — no instructions included.
|
|
28
|
+
* Sent as `role: "user"` in the chat request.
|
|
29
|
+
*/
|
|
30
|
+
export declare function buildUserContent(paragraphs: ParagraphGroup[]): string;
|
|
31
|
+
/**
|
|
32
|
+
* Build the combined string used for token estimation in the chunker.
|
|
33
|
+
* Mirrors what will be sent to the model (system + user content).
|
|
34
|
+
*/
|
|
35
|
+
export declare function buildPrompt(paragraphs: ParagraphGroup[]): string;
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
 * Group sorted layout blocks by paragraphId into ParagraphGroup objects.
 * Non-text blocks (faces, barcodes, etc.) are ignored.
 * Paragraph order follows first appearance in `blocks`; lines within a
 * paragraph are ordered by ascending lineId.
 */
export function groupByParagraph(blocks, pageIndex = 0) {
    const paragraphs = new Map();
    for (const block of blocks) {
        if (block.kind !== 'text') {
            continue;
        }
        let entry = paragraphs.get(block.paragraphId);
        if (entry === undefined) {
            // y of the first block seen serves as the paragraph's spatial hint.
            entry = { y: block.y, lines: new Map() };
            paragraphs.set(block.paragraphId, entry);
        }
        const tokens = entry.lines.get(block.lineId);
        if (tokens === undefined) {
            entry.lines.set(block.lineId, [block.text]);
        }
        else {
            tokens.push(block.text);
        }
    }
    const result = [];
    for (const [paragraphId, entry] of paragraphs) {
        // Order lines by lineId, then join each line's tokens with spaces.
        const sortedLineIds = [...entry.lines.keys()].sort((a, b) => a - b);
        const lineStrings = sortedLineIds.map((id) => entry.lines.get(id).join(' '));
        result.push({ paragraphId, y: entry.y, lines: lineStrings, page: pageIndex });
    }
    return result;
}
|
|
28
|
+
/**
 * The system prompt sent as `role: "system"` in every chat request.
 * Kept separate from user content so the model treats it as hard constraints,
 * not as text to be summarised or analysed.
 *
 * NOTE: the trailing `\` on several lines below is a template-literal line
 * continuation — those physical lines join into one sentence with no newline,
 * while the blank lines are real paragraph breaks in the prompt.
 */
export const SYSTEM_PROMPT = `ACT AS A HIGH-FIDELITY DOCUMENT PARSER. \
Your only goal is to reconstruct the provided OCR data into a structured \
Markdown document. NEVER skip text. NEVER summarize. \
Content must be 100% identical to the source.

DO NOT SUMMARIZE.
Transcribe every single word from the provided OCR data.
Maintain 1:1 content fidelity. If the source has 5 paragraphs, the output must have 5 paragraphs.

STRICT OUTPUT: Output ONLY the Markdown representation. \
No preamble, no "Summary of key events", no "Here is the result".

FORMATTING RULES:
- Add # / ## / ### before lines that are clearly headings or titles
- Add - before items that are clearly list entries
- Join lines within the same paragraph into flowing prose
- Preserve blank lines between paragraphs
- Do NOT wrap output in code fences`;
|
|
51
|
+
/**
 * Build the user-facing content block from a list of paragraphs.
 * This is the OCR text that the model will format — no instructions included.
 * Sent as `role: "user"` in the chat request.
 * Page headers ("[Page N]") are emitted only when more than one page is present.
 */
export function buildUserContent(paragraphs) {
    const seenPages = new Set(paragraphs.map((p) => p.page));
    const sortedPages = [...seenPages].sort((a, b) => a - b);
    const labelPages = sortedPages.length > 1;
    const sections = [];
    for (const page of sortedPages) {
        if (labelPages) {
            // Page numbers are 1-based for the model's benefit.
            sections.push(`[Page ${page + 1}]`);
        }
        for (const para of paragraphs) {
            if (para.page !== page) {
                continue;
            }
            const header = `[Paragraph ${para.paragraphId}, y≈${para.y.toFixed(2)}]`;
            sections.push(`${header}\n${para.lines.join('\n')}`);
        }
    }
    const task = 'Convert the OCR source below into Markdown. ' +
        'Reproduce EVERY word EXACTLY. Do not respond, explain, or ask questions.\n\n' +
        '<ocr_source>';
    return `${task}\n\n${sections.join('\n\n')}\n</ocr_source>`;
}
|
|
76
|
+
/**
 * Build the combined string used for token estimation in the chunker.
 * Mirrors what will be sent to the model (system + user content).
 */
export function buildPrompt(paragraphs) {
    return [SYSTEM_PROMPT, buildUserContent(paragraphs)].join('\n\n');
}
|
package/package.json
CHANGED
|
@@ -1,15 +1,33 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "macos-vision",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "Apple Vision OCR
|
|
3
|
+
"version": "1.3.0",
|
|
4
|
+
"description": "Apple Vision OCR + image/PDF analysis for Node.js, with optional Ollama-driven Markdown pipeline — native, fast, offline",
|
|
5
5
|
"author": "Adrian Wolczuk",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "module",
|
|
8
8
|
"main": "./dist/index.js",
|
|
9
9
|
"types": "./dist/index.d.ts",
|
|
10
10
|
"bin": {
|
|
11
|
-
"macos-vision": "
|
|
11
|
+
"macos-vision": "dist/cli.js"
|
|
12
12
|
},
|
|
13
|
+
"exports": {
|
|
14
|
+
".": {
|
|
15
|
+
"import": "./dist/index.js",
|
|
16
|
+
"types": "./dist/index.d.ts"
|
|
17
|
+
},
|
|
18
|
+
"./markdown": {
|
|
19
|
+
"import": "./dist/markdown/index.js",
|
|
20
|
+
"types": "./dist/markdown/index.d.ts"
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"files": [
|
|
24
|
+
"dist",
|
|
25
|
+
"bin",
|
|
26
|
+
"scripts/build-native.js",
|
|
27
|
+
"src/native",
|
|
28
|
+
"README.md",
|
|
29
|
+
"LICENSE"
|
|
30
|
+
],
|
|
13
31
|
"repository": {
|
|
14
32
|
"type": "git",
|
|
15
33
|
"url": "git+https://github.com/woladi/macos-vision.git"
|
|
@@ -25,7 +43,8 @@
|
|
|
25
43
|
"lint": "eslint src/**/*.ts",
|
|
26
44
|
"format": "prettier --write src/**/*.ts",
|
|
27
45
|
"release": "release-it",
|
|
28
|
-
"release:beta": "release-it --preRelease=beta"
|
|
46
|
+
"release:beta": "release-it --preRelease=beta",
|
|
47
|
+
"eval:setup": "git clone https://github.com/opendataloader-project/opendataloader-bench eval/bench || echo 'bench already cloned'"
|
|
29
48
|
},
|
|
30
49
|
"keywords": [
|
|
31
50
|
"ocr",
|
|
@@ -40,7 +59,13 @@
|
|
|
40
59
|
"barcode",
|
|
41
60
|
"qr-code",
|
|
42
61
|
"document-detection",
|
|
43
|
-
"image-classification"
|
|
62
|
+
"image-classification",
|
|
63
|
+
"markdown",
|
|
64
|
+
"image-to-markdown",
|
|
65
|
+
"pdf-to-markdown",
|
|
66
|
+
"ollama",
|
|
67
|
+
"llm",
|
|
68
|
+
"document-pipeline"
|
|
44
69
|
],
|
|
45
70
|
"lint-staged": {
|
|
46
71
|
"src/**/*.ts": [
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import PDFKit
|
|
2
|
+
import AppKit
|
|
3
|
+
import Foundation
|
|
4
|
+
|
|
5
|
+
// ─── Result struct ────────────────────────────────────────────────────────────
|
|
6
|
+
|
|
7
|
+
/// JSON-serializable record describing one rasterized PDF page,
/// emitted as part of the helper's stdout array.
struct PageResult: Codable {
    let page: Int // 0-based
    let path: String // absolute path to the rendered PNG
}
|
|
11
|
+
|
|
12
|
+
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
/// Print `message` to stderr (prefixed with "ERROR: ") and exit with status 1.
func fail(_ message: String) -> Never {
    fputs("ERROR: \(message)\n", stderr)
    exit(1)
}
|
|
18
|
+
|
|
19
|
+
/// Encode `value` as a JSON string.
/// Falls back to "[]" on failure so stdout always stays parseable as JSON.
func encodeJSON<T: Encodable>(_ value: T) -> String {
    guard let data = try? JSONEncoder().encode(value),
          let str = String(data: data, encoding: .utf8) else { return "[]" }
    return str
}
|
|
24
|
+
|
|
25
|
+
// ─── Argument parsing ─────────────────────────────────────────────────────────

let args = CommandLine.arguments
guard args.count >= 2 else {
    fail("Usage: pdf-helper <path-to-pdf>")
}

let pdfPath = args[1]
let pdfURL = URL(fileURLWithPath: pdfPath)

guard let pdf = PDFDocument(url: pdfURL) else {
    fail("Cannot open PDF: \(pdfPath)")
}

let pageCount = pdf.pageCount
guard pageCount > 0 else {
    fail("PDF has no pages: \(pdfPath)")
}

// ─── Output directory: ~/.cache/macos-vision/{basename}-{uuid}/ ───────────────

// A fresh UUID per invocation avoids collisions between concurrent runs
// on the same PDF.
let basename = pdfURL.deletingPathExtension().lastPathComponent
let uuid = UUID().uuidString.lowercased()
let cacheBase = FileManager.default.homeDirectoryForCurrentUser
    .appendingPathComponent(".cache/macos-vision")
let outDir = cacheBase.appendingPathComponent("\(basename)-\(uuid)")

do {
    try FileManager.default.createDirectory(at: outDir, withIntermediateDirectories: true)
} catch {
    fail("Cannot create output directory \(outDir.path): \(error.localizedDescription)")
}

// ─── Rasterize each page at 300 DPI ──────────────────────────────────────────

// PDF points are 72 pt/inch. Scale factor for 300 DPI = 300/72 ≈ 4.167
let scale: CGFloat = 300.0 / 72.0

var results: [PageResult] = []

for pageIndex in 0..<pageCount {
    guard let page = pdf.page(at: pageIndex) else {
        fail("Cannot access page \(pageIndex) of \(pdfPath)")
    }

    // Pixel dimensions of the output bitmap at the target DPI.
    let mediaBox = page.bounds(for: .mediaBox)
    let width = Int((mediaBox.width * scale).rounded())
    let height = Int((mediaBox.height * scale).rounded())

    // bytesPerRow/bitsPerPixel of 0 lets AppKit pick appropriate values.
    guard let bitmapRep = NSBitmapImageRep(
        bitmapDataPlanes: nil,
        pixelsWide: width,
        pixelsHigh: height,
        bitsPerSample: 8,
        samplesPerPixel: 4,
        hasAlpha: true,
        isPlanar: false,
        colorSpaceName: .calibratedRGB,
        bytesPerRow: 0,
        bitsPerPixel: 0
    ) else {
        fail("Cannot create bitmap for page \(pageIndex)")
    }

    guard let ctx = NSGraphicsContext(bitmapImageRep: bitmapRep) else {
        fail("Cannot create graphics context for page \(pageIndex)")
    }

    // Fill white background (PDFs are transparent by default)
    NSGraphicsContext.saveGraphicsState()
    NSGraphicsContext.current = ctx
    NSColor.white.setFill()
    NSRect(x: 0, y: 0, width: width, height: height).fill()

    // Scale the CG context so the page's point coordinates map onto the
    // larger pixel grid, then let PDFKit draw the page content.
    ctx.cgContext.scaleBy(x: scale, y: scale)
    page.draw(with: .mediaBox, to: ctx.cgContext)
    NSGraphicsContext.restoreGraphicsState()

    guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else {
        fail("Cannot encode page \(pageIndex) to PNG")
    }

    // Zero-pad page number to 3 digits: page-001.png, page-002.png, …
    let filename = String(format: "%@-page-%03d.png", basename, pageIndex + 1)
    let outPath = outDir.appendingPathComponent(filename)

    do {
        try pngData.write(to: outPath)
    } catch {
        fail("Cannot write \(outPath.path): \(error.localizedDescription)")
    }

    results.append(PageResult(page: pageIndex, path: outPath.path))
}

// ─── Output JSON ──────────────────────────────────────────────────────────────

// Single JSON array on stdout — the Node side parses this to learn where
// each page PNG was written.
print(encodeJSON(results))
|