monocr 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,115 @@
1
+ # monocr (JavaScript/Node.js)
2
+
3
+ Mon language OCR using ONNX Runtime for Node.js applications.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install monocr
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```javascript
14
+ const { read_image } = require("monocr");
15
+
16
+ const text = await read_image("path/to/image.jpg", "model.onnx", "charset.txt");
17
+ console.log(text);
18
+ ```
19
+
20
+ ## API
21
+
22
+ ### read_image(imagePath, modelPath, charsetPath)
23
+
24
+ Recognize text from an image file.
25
+
26
+ **Parameters:**
27
+
28
+ - `imagePath` - Path to image file (jpg, png)
29
+ - `modelPath` - Path to ONNX model (optional, defaults to `../model/monocr.onnx`)
30
+ - `charsetPath` - Path to charset file (optional, defaults to `../model/charset.txt`)
31
+
32
+ **Returns:** `Promise<string>` - Recognized text
33
+
34
+ ### read_pdf(pdfPath, modelPath, charsetPath)
35
+
36
+ Recognize text from a PDF file.
37
+
38
+ **Parameters:**
39
+
40
+ - `pdfPath` - Path to PDF file
41
+ - `modelPath` - Path to ONNX model (optional)
42
+ - `charsetPath` - Path to charset file (optional)
43
+
44
+ **Returns:** `Promise<string[]>` - Array of text per page
45
+
46
+ ### read_image_with_accuracy(imagePath, groundTruth, modelPath, charsetPath)
47
+
48
+ Recognize text with accuracy measurement.
49
+
50
+ **Parameters:**
51
+
52
+ - `imagePath` - Path to image file
53
+ - `groundTruth` - Expected text for accuracy calculation
54
+ - `modelPath` - Path to ONNX model (optional)
55
+ - `charsetPath` - Path to charset file (optional)
56
+
57
+ **Returns:** `Promise<{text: string, accuracy: number}>` - Text and accuracy percentage
58
+
59
+ ### MonOCR Class
60
+
61
+ For advanced usage, use the `MonOCR` class directly:
62
+
63
+ ```javascript
64
+ const { MonOCR } = require("monocr");
65
+
66
+ const ocr = new MonOCR("model.onnx", "charset.txt");
67
+ await ocr.init();
68
+
69
+ // Single line
70
+ const text = await ocr.predictLine(imageSource);
71
+
72
+ // Full page (with line segmentation)
73
+ const results = await ocr.predictPage(imagePath);
74
+ ```
75
+
76
+ ## CLI
77
+
78
+ The package includes a command-line tool:
79
+
80
+ ```bash
81
+ # Single image
82
+ monocr image path/to/image.jpg
83
+
84
+ # PDF file
85
+ monocr pdf path/to/document.pdf
86
+
87
+ # Batch processing
88
+ monocr batch path/to/images/ -o results.json
89
+ ```
90
+
91
+ ## Examples
92
+
93
+ See the `examples/` directory for detailed usage examples:
94
+
95
+ - `simple.js` - Basic image OCR
96
+ - `with-accuracy.js` - OCR with accuracy measurement
97
+ - `batch.js` - Batch processing
98
+
99
+ Run examples:
100
+
101
+ ```bash
102
+ node examples/simple.js
103
+ ```
104
+
105
+ ## Model Files
106
+
107
+ This package requires the ONNX model and charset files. Download them from:
108
+
109
+ - [GitHub Releases](https://github.com/janakh/monocr-onnx/releases)
110
+
111
+ Place them in a `model/` directory or specify custom paths in the API calls.
112
+
113
+ ## License
114
+
115
+ MIT
package/bin/monocr.js ADDED
@@ -0,0 +1,85 @@
1
#!/usr/bin/env node

/**
 * monocr command-line interface: `image`, `pdf` and `batch` OCR commands.
 */

const { program } = require('commander');
const { read_image, read_pdf } = require('../src/index');
const fs = require('fs');
const path = require('path');

// BUG FIX: the previous defaults ('../model/monocr.onnx') were resolved
// against the process working directory, so the CLI only worked when invoked
// from a specific cwd. Resolve the bundled model assets relative to this
// script instead (<package root>/model).
const DEFAULT_MODEL = path.join(__dirname, '..', 'model', 'monocr.onnx');
const DEFAULT_CHARSET = path.join(__dirname, '..', 'model', 'charset.txt');

program
  .name('monocr')
  .description('Mon language OCR using ONNX Runtime')
  .version('0.1.1'); // kept in sync with package.json (was stale at 0.1.0)

program
  .command('image <path>')
  .description('Recognize text from an image file')
  .option('-m, --model <path>', 'Path to ONNX model', DEFAULT_MODEL)
  .option('-c, --charset <path>', 'Path to charset file', DEFAULT_CHARSET)
  .action(async (imagePath, options) => {
    try {
      const text = await read_image(imagePath, options.model, options.charset);
      console.log(text);
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  });

program
  .command('pdf <path>')
  .description('Recognize text from a PDF file')
  .option('-m, --model <path>', 'Path to ONNX model', DEFAULT_MODEL)
  .option('-c, --charset <path>', 'Path to charset file', DEFAULT_CHARSET)
  .action(async (pdfPath, options) => {
    try {
      const pages = await read_pdf(pdfPath, options.model, options.charset);
      pages.forEach((pageText, i) => {
        console.log(`--- Page ${i + 1} ---`);
        console.log(pageText);
        console.log();
      });
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  });

program
  .command('batch <directory>')
  .description('Process all images in a directory')
  .option('-m, --model <path>', 'Path to ONNX model', DEFAULT_MODEL)
  .option('-c, --charset <path>', 'Path to charset file', DEFAULT_CHARSET)
  .option('-o, --output <path>', 'Output file for results (optional)')
  .action(async (directory, options) => {
    try {
      const files = fs.readdirSync(directory)
        .filter((f) => /\.(jpg|jpeg|png)$/i.test(f))
        .sort();

      const results = [];

      for (const file of files) {
        const filePath = path.join(directory, file);
        // Progress goes to stderr so stdout stays machine-readable JSON.
        console.error(`Processing: ${file}...`);

        try {
          const text = await read_image(filePath, options.model, options.charset);
          results.push({ file, text, success: true });
        } catch (err) {
          // One bad image must not abort the whole batch.
          results.push({ file, error: err.message, success: false });
        }
      }

      if (options.output) {
        fs.writeFileSync(options.output, JSON.stringify(results, null, 2));
        console.error(`Results written to ${options.output}`);
      } else {
        console.log(JSON.stringify(results, null, 2));
      }
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  });

program.parse();
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "monocr",
3
+ "version": "0.1.1",
4
+ "description": "Cross-platform Mon (mnw) language OCR using ONNX Runtime. Supports Node.js.",
5
+ "main": "src/index.js",
6
+ "bin": {
7
+ "monocr": "bin/monocr.js"
8
+ },
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "git+https://github.com/janakh/monocr-onnx.git",
12
+ "directory": "js"
13
+ },
14
+ "keywords": [
15
+ "ocr",
16
+ "mon",
17
+ "mnw",
18
+ "onnx",
19
+ "machine-learning"
20
+ ],
21
+ "author": "Janakh",
22
+ "license": "MIT",
23
+ "bugs": {
24
+ "url": "https://github.com/janakh/monocr-onnx/issues"
25
+ },
26
+ "homepage": "https://github.com/janakh/monocr-onnx/tree/main/js#readme",
27
+ "files": [
28
+ "src/",
29
+ "bin/",
30
+ "README.md",
31
+ "LICENSE"
32
+ ],
33
+ "dependencies": {
34
+ "commander": "^11.1.0",
35
+ "onnxruntime-node": "^1.15.0",
36
+ "pdf2pic": "^3.2.0",
37
+ "sharp": "^0.32.0"
38
+ },
39
+ "publishConfig": {
40
+ "access": "public"
41
+ }
42
+ }
package/src/index.js ADDED
@@ -0,0 +1,87 @@
1
+ const MonOCR = require('./monocr');
2
+ const { calculateAccuracy } = require('./utils');
3
+
4
+ module.exports = {
5
+ MonOCR,
6
+ calculateAccuracy,
7
+ read_image,
8
+ read_pdf,
9
+ read_image_with_accuracy
10
+ };
11
+
12
/**
 * Read text from an image file.
 * @param {string} imagePath - Path to image file.
 * @param {string} [modelPath] - Path to ONNX model; defaults to the package's
 *   bundled `model/monocr.onnx` (resolved relative to this file, not the cwd).
 * @param {string} [charsetPath] - Path to charset file; defaults to the
 *   package's bundled `model/charset.txt`.
 * @returns {Promise<string>} Recognized text, one output line per detected
 *   text line, joined with '\n'.
 */
async function read_image(imagePath, modelPath, charsetPath) {
  const path = require('path');
  // BUG FIX: the previous '../model/...' defaults resolved against the
  // process working directory; resolve against this file's location instead.
  const resolvedModel = modelPath || path.join(__dirname, '..', 'model', 'monocr.onnx');
  const resolvedCharset = charsetPath || path.join(__dirname, '..', 'model', 'charset.txt');

  const ocr = new MonOCR(resolvedModel, resolvedCharset);
  await ocr.init();
  const results = await ocr.predictPage(imagePath);
  return results.map((r) => r.text).join('\n');
}
25
+
26
/**
 * Read text from a PDF file by rasterizing each page and running OCR on it.
 * @param {string} pdfPath - Path to PDF file.
 * @param {string} [modelPath] - Path to ONNX model; defaults to the bundled model.
 * @param {string} [charsetPath] - Path to charset file; defaults to the bundled charset.
 * @returns {Promise<string[]>} Array of recognized text, one entry per page.
 */
async function read_pdf(pdfPath, modelPath, charsetPath) {
  const { fromPath } = require('pdf2pic');
  const path = require('path');
  const fs = require('fs');
  const os = require('os');

  // BUG FIX: resolve the default model paths against this file, not the cwd.
  const resolvedModel = modelPath || path.join(__dirname, '..', 'model', 'monocr.onnx');
  const resolvedCharset = charsetPath || path.join(__dirname, '..', 'model', 'charset.txt');

  const ocr = new MonOCR(resolvedModel, resolvedCharset);
  await ocr.init();

  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'monocr-'));

  try {
    // 300 DPI at A4 pixel dimensions (2480x3508) gives the model enough
    // resolution for line segmentation.
    const converter = fromPath(pdfPath, {
      density: 300,
      format: 'png',
      width: 2480,
      height: 3508,
      saveFilename: 'page',
      savePath: tempDir
    });

    const pages = [];
    let pageNum = 1;

    // pdf2pic exposes no page count here, so convert pages until conversion
    // throws, which is taken to mean we ran past the last page.
    // NOTE(review): this also masks genuine conversion failures on page 1 —
    // confirm this convention against the pdf2pic version in use.
    while (true) {
      let converted;
      try {
        converted = await converter(pageNum, { responseType: 'image' });
      } catch (err) {
        break; // past the last page
      }

      // BUG FIX: OCR errors were previously swallowed by the same catch that
      // detects end-of-document, silently truncating the result. Let them
      // propagate; the finally block still cleans up.
      const results = await ocr.predictPage(converted.path);
      pages.push(results.map((r) => r.text).join('\n'));
      pageNum++;
    }

    return pages;
  } finally {
    // Always remove the rasterized pages, even if OCR throws mid-document.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
74
+
75
/**
 * Read text from an image with accuracy measurement.
 * @param {string} imagePath - Path to image file.
 * @param {string} groundTruth - Expected text for accuracy calculation.
 * @param {string} [modelPath] - Path to ONNX model; defaults to the bundled model.
 * @param {string} [charsetPath] - Path to charset file; defaults to the bundled charset.
 * @returns {Promise<{text: string, accuracy: number}>} Recognized text plus
 *   character accuracy as a percentage (0-100).
 */
async function read_image_with_accuracy(imagePath, groundTruth, modelPath, charsetPath) {
  const path = require('path');
  // BUG FIX: resolve the default model paths against this file, not the cwd
  // (consistent with read_image / read_pdf).
  const resolvedModel = modelPath || path.join(__dirname, '..', 'model', 'monocr.onnx');
  const resolvedCharset = charsetPath || path.join(__dirname, '..', 'model', 'charset.txt');

  const text = await read_image(imagePath, resolvedModel, resolvedCharset);
  // calculateAccuracy may return a fixed-point string; normalize to a number.
  const accuracy = calculateAccuracy(text, groundTruth);
  return { text, accuracy: parseFloat(accuracy) };
}
package/src/monocr.js ADDED
@@ -0,0 +1,153 @@
1
+ const ort = require('onnxruntime-node');
2
+ const sharp = require('sharp');
3
+ const fs = require('fs');
4
+ const LineSegmenter = require('./segmenter');
5
+
6
class MonOCR {
  /**
   * Mon-language OCR engine backed by an ONNX model.
   * @param {string} modelPath - Path to the ONNX model file.
   * @param {string} charsetPath - Path to the charset file; character i of the
   *   (trimmed) file maps to class index i+1, index 0 being the CTC blank.
   */
  constructor(modelPath, charsetPath) {
    this.modelPath = modelPath;
    this.charsetPath = charsetPath;
    this.session = null;
    this.charset = "";
    this.segmenter = new LineSegmenter();

    // Fixed input geometry expected by the model.
    this.targetHeight = 64;
    this.targetWidth = 1024;
  }

  /**
   * Create the inference session and load the charset.
   * Idempotent: subsequent calls are no-ops.
   */
  async init() {
    if (this.session) return;
    this.session = await ort.InferenceSession.create(this.modelPath);
    if (this.charsetPath) {
      this.charset = fs.readFileSync(this.charsetPath, 'utf-8').trim();
    }
  }

  /**
   * Replicates Python's resize_and_pad:
   * 1. Resize height to 64, maintain aspect ratio.
   * 2. Pad width to 1024 (white background).
   * 3. Normalize to [-1, 1].
   *
   * @param {string|Buffer|object} imageSource - File path, image buffer, or a
   *   sharp instance (e.g. a crop produced by the segmenter).
   * @returns {Promise<ort.Tensor>} float32 tensor of shape [1, 1, 64, 1024].
   */
  async preprocess(imageSource) {
    let sharpImg;
    if (imageSource && typeof imageSource.metadata === 'function') {
      // BUG FIX: sharp's metadata() describes the ORIGINAL input and ignores
      // queued operations such as extract(), so a cropped-line pipeline
      // reported full-page dimensions and the resize scale was wrong.
      // Render the incoming pipeline to a buffer first so the dimensions we
      // read match the pixels we process.
      sharpImg = sharp(await imageSource.png().toBuffer());
    } else {
      sharpImg = sharp(imageSource);
    }
    const metadata = await sharpImg.metadata();

    const scale = this.targetHeight / metadata.height;
    // Clamp to the model's max width and never let rounding produce width 0,
    // which sharp's resize() would reject.
    const newWidth = Math.max(1, Math.min(this.targetWidth, Math.round(metadata.width * scale)));

    // Grayscale, height-normalized line image as raw 8-bit pixels.
    const resizedBuffer = await sharpImg
      .grayscale()
      .resize({
        height: this.targetHeight,
        width: newWidth,
        fit: 'fill'
      })
      .raw()
      .toBuffer();

    // Target canvas, 1024 wide, initialized to white.
    // Python: canvas.astype(np.float32) / 127.5 - 1.0
    //   255 (white) -> 1.0, 0 (black) -> -1.0
    const totalSize = this.targetHeight * this.targetWidth;
    const canvas = new Float32Array(totalSize).fill(1.0);

    for (let y = 0; y < this.targetHeight; y++) {
      for (let x = 0; x < newWidth; x++) {
        const pixelValue = resizedBuffer[y * newWidth + x];
        canvas[y * this.targetWidth + x] = (pixelValue / 127.5) - 1.0;
      }
      // Columns >= newWidth keep the white 1.0 fill (the padding).
    }

    return new ort.Tensor('float32', canvas, [1, 1, this.targetHeight, this.targetWidth]);
  }

  /**
   * CTC greedy decoding: per-timestep argmax, dropping blanks (class 0) and
   * collapsing consecutive repeats.
   * @param {ort.Tensor} outputTensor - Logits of shape [Batch, Time, Classes].
   * @returns {string} Decoded text.
   */
  decode(outputTensor) {
    const data = outputTensor.data;
    const dims = outputTensor.dims; // [Batch, Time, Classes]
    const numClasses = dims[2];
    const sequenceLength = dims[1];

    // Class index i+1 corresponds to charset[i]; index 0 is the CTC blank.
    const idx2char = {};
    for (let i = 0; i < this.charset.length; i++) {
      idx2char[i + 1] = this.charset[i];
    }

    let decodedText = "";
    let prevIdx = -1;

    for (let t = 0; t < sequenceLength; t++) {
      // Argmax over classes for this timestep.
      let maxVal = -Infinity;
      let maxIdx = 0;
      for (let c = 0; c < numClasses; c++) {
        const val = data[t * numClasses + c];
        if (val > maxVal) {
          maxVal = val;
          maxIdx = c;
        }
      }

      // CTC logic: skip blanks and repeats of the previous class.
      if (maxIdx !== 0 && maxIdx !== prevIdx) {
        decodedText += idx2char[maxIdx] || "";
      }
      prevIdx = maxIdx;
    }

    return decodedText;
  }

  /**
   * Recognize a single pre-segmented text line.
   * @param {string|Buffer|object} imageSource - See preprocess().
   * @returns {Promise<string>} Recognized text for the line.
   */
  async predictLine(imageSource) {
    if (!this.session) await this.init();

    const inputTensor = await this.preprocess(imageSource);
    const feeds = {};
    feeds[this.session.inputNames[0]] = inputTensor;

    const results = await this.session.run(feeds);
    const outputTensor = results[this.session.outputNames[0]];

    return this.decode(outputTensor);
  }

  /**
   * Processes a full page: segments into lines and predicts each.
   * @param {string|Buffer} imagePath - Page image path or buffer.
   * @returns {Promise<Array<{text: string, bbox: object}>>} One entry per
   *   detected line, with its bounding box from the segmenter.
   */
  async predictPage(imagePath) {
    const lines = await this.segmenter.segment(imagePath);
    const results = [];

    for (const line of lines) {
      const text = await this.predictLine(line.img);
      results.push({
        text,
        bbox: line.bbox
      });
    }

    return results;
  }
}

module.exports = MonOCR;
@@ -0,0 +1,136 @@
1
+ const sharp = require('sharp');
2
+
3
class LineSegmenter {
  /**
   * Horizontal-projection line segmenter for dark text on light background.
   * @param {number} minLineH Minimum height of a line to be considered valid.
   * @param {number} smoothWindow Smoothing window for projection profile.
   */
  constructor(minLineH = 10, smoothWindow = 3) {
    this.minLineH = minLineH;
    this.smoothWindow = smoothWindow;
  }

  /**
   * Segment a document image into text lines.
   * @param {string|Buffer} imagePath Path to image or Buffer.
   * @returns {Promise<Array<{img: Buffer, bbox: {x: number, y: number, w: number, h: number}}>>}
   *   One entry per line; `img` is a self-contained PNG buffer of the crop.
   */
  async segment(imagePath) {
    const image = sharp(imagePath);
    const { width, height } = await image.metadata();

    // 1. Raw grayscale pixel data for thresholding.
    const grayBuffer = await image
      .grayscale()
      .raw()
      .toBuffer();

    // 2. Fixed threshold at 128 (dark pixel = text, for black text on white
    //    paper), accumulated into a horizontal projection profile: number of
    //    text pixels per row. (The former per-pixel `binary` array was dead
    //    storage — written but never read — and has been removed.)
    const hist = new Float32Array(height).fill(0);
    for (let y = 0; y < height; y++) {
      for (let x = 0; x < width; x++) {
        if (grayBuffer[y * width + x] < 128) {
          hist[y]++;
        }
      }
    }

    // 3. Smooth the profile with a centered moving average (clamped at the
    //    image edges, hence the explicit count).
    let smoothedHist = hist;
    if (this.smoothWindow > 1) {
      smoothedHist = new Float32Array(height);
      const half = Math.floor(this.smoothWindow / 2);
      for (let i = 0; i < height; i++) {
        let sum = 0;
        let count = 0;
        for (let j = i - half; j <= i + half; j++) {
          if (j >= 0 && j < height) {
            sum += hist[j];
            count++;
          }
        }
        smoothedHist[i] = sum / count;
      }
    }

    // 4. Gap detection: rows whose density falls below 5% of the mean
    //    non-empty row density separate consecutive lines.
    const nonZeroVals = smoothedHist.filter((v) => v > 0);
    if (nonZeroVals.length === 0) return [];

    const meanDensity = nonZeroVals.reduce((a, b) => a + b, 0) / nonZeroVals.length;
    const gapThreshold = meanDensity * 0.05;

    const results = [];
    let start = null;

    for (let y = 0; y < height; y++) {
      const isText = smoothedHist[y] > gapThreshold;
      if (isText && start === null) {
        start = y;
      } else if (!isText && start !== null) {
        const end = y;
        if (end - start >= this.minLineH) {
          await this._extractLine(image, grayBuffer, width, height, start, end, results);
        }
        start = null;
      }
    }

    // Flush a final line that runs to the bottom edge of the page.
    if (start !== null && (height - start) >= this.minLineH) {
      await this._extractLine(image, grayBuffer, width, height, start, height, results);
    }

    return results;
  }

  /**
   * Crop one detected line strip and append {img, bbox} to `results`.
   * Skips strips that contain no text pixels at all.
   */
  async _extractLine(image, grayBuffer, width, height, rStart, rEnd, results) {
    // Find horizontal bounds of text pixels within this vertical strip.
    let xMin = width;
    let xMax = 0;
    let hasPixels = false;

    for (let y = rStart; y < rEnd; y++) {
      for (let x = 0; x < width; x++) {
        if (grayBuffer[y * width + x] < 128) {
          if (x < xMin) xMin = x;
          if (x > xMax) xMax = x;
          hasPixels = true;
        }
      }
    }

    if (!hasPixels) return;

    // Pad the tight bounding box slightly, clamped to the image bounds.
    const pad = 4;
    const y1 = Math.max(0, rStart - pad);
    const y2 = Math.min(height, rEnd + pad);
    const x1 = Math.max(0, xMin - pad);
    const x2 = Math.min(width, xMax + pad);

    const w = x2 - x1;
    const h = y2 - y1;

    // BUG FIX: previously this pushed a sharp pipeline with a queued
    // extract(); sharp's metadata() on such a pipeline reports the ORIGINAL
    // (full-page) dimensions, so consumers mis-scaled the crop. Render the
    // crop to a PNG buffer so its dimensions are self-describing.
    const cropBuffer = await image
      .clone()
      .extract({ left: x1, top: y1, width: w, height: h })
      .png()
      .toBuffer();

    results.push({
      img: cropBuffer,
      bbox: { x: x1, y: y1, w, h }
    });
  }
}

module.exports = LineSegmenter;
package/src/utils.js ADDED
@@ -0,0 +1,25 @@
1
/**
 * Calculate character-level accuracy between predicted and ground truth text.
 *
 * Strings are compared position-by-position (no alignment / edit distance)
 * over the length of the longer one; every mismatch or length difference
 * counts as one error.
 *
 * @param {string} predicted - Predicted text (null/undefined treated as "").
 * @param {string} groundTruth - Ground truth text.
 * @returns {number} Accuracy percentage in [0, 100], rounded to 2 decimals.
 */
function calculateAccuracy(predicted, groundTruth) {
  // No reference text -> nothing to score against.
  if (!groundTruth) return 0;

  // Guard against a null/undefined prediction instead of throwing.
  const pred = predicted || '';
  const len = Math.max(pred.length, groundTruth.length);
  if (len === 0) return 100;

  let errors = 0;
  for (let i = 0; i < len; i++) {
    if (pred[i] !== groundTruth[i]) {
      errors++;
    }
  }

  // BUG FIX: toFixed() returns a string, but the documented contract (and
  // callers such as read_image_with_accuracy) expect a number. Round to two
  // decimals, then convert back to a number.
  return Number(((1 - errors / len) * 100).toFixed(2));
}
22
+
23
// Public helpers of this module.
module.exports = { calculateAccuracy };