monocr 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +115 -0
- package/bin/monocr.js +85 -0
- package/package.json +42 -0
- package/src/index.js +87 -0
- package/src/monocr.js +153 -0
- package/src/segmenter.js +136 -0
- package/src/utils.js +25 -0
package/README.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# monocr (JavaScript/Node.js)
|
|
2
|
+
|
|
3
|
+
Mon language OCR using ONNX Runtime for Node.js applications.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install monocr
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```javascript
|
|
14
|
+
const { read_image } = require("monocr");
|
|
15
|
+
|
|
16
|
+
const text = await read_image("path/to/image.jpg", "model.onnx", "charset.txt");
|
|
17
|
+
console.log(text);
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## API
|
|
21
|
+
|
|
22
|
+
### read_image(imagePath, modelPath, charsetPath)
|
|
23
|
+
|
|
24
|
+
Recognize text from an image file.
|
|
25
|
+
|
|
26
|
+
**Parameters:**
|
|
27
|
+
|
|
28
|
+
- `imagePath` - Path to image file (jpg, png)
|
|
29
|
+
- `modelPath` - Path to ONNX model (optional, defaults to `../model/monocr.onnx`)
|
|
30
|
+
- `charsetPath` - Path to charset file (optional, defaults to `../model/charset.txt`)
|
|
31
|
+
|
|
32
|
+
**Returns:** `Promise<string>` - Recognized text
|
|
33
|
+
|
|
34
|
+
### read_pdf(pdfPath, modelPath, charsetPath)
|
|
35
|
+
|
|
36
|
+
Recognize text from a PDF file.
|
|
37
|
+
|
|
38
|
+
**Parameters:**
|
|
39
|
+
|
|
40
|
+
- `pdfPath` - Path to PDF file
|
|
41
|
+
- `modelPath` - Path to ONNX model (optional)
|
|
42
|
+
- `charsetPath` - Path to charset file (optional)
|
|
43
|
+
|
|
44
|
+
**Returns:** `Promise<string[]>` - Array of text per page
|
|
45
|
+
|
|
46
|
+
### read_image_with_accuracy(imagePath, groundTruth, modelPath, charsetPath)
|
|
47
|
+
|
|
48
|
+
Recognize text with accuracy measurement.
|
|
49
|
+
|
|
50
|
+
**Parameters:**
|
|
51
|
+
|
|
52
|
+
- `imagePath` - Path to image file
|
|
53
|
+
- `groundTruth` - Expected text for accuracy calculation
|
|
54
|
+
- `modelPath` - Path to ONNX model (optional)
|
|
55
|
+
- `charsetPath` - Path to charset file (optional)
|
|
56
|
+
|
|
57
|
+
**Returns:** `Promise<{text: string, accuracy: number}>` - Text and accuracy percentage
|
|
58
|
+
|
|
59
|
+
### MonOCR Class
|
|
60
|
+
|
|
61
|
+
For advanced usage, use the `MonOCR` class directly:
|
|
62
|
+
|
|
63
|
+
```javascript
|
|
64
|
+
const { MonOCR } = require("monocr");
|
|
65
|
+
|
|
66
|
+
const ocr = new MonOCR("model.onnx", "charset.txt");
|
|
67
|
+
await ocr.init();
|
|
68
|
+
|
|
69
|
+
// Single line
|
|
70
|
+
const text = await ocr.predictLine(imageSource);
|
|
71
|
+
|
|
72
|
+
// Full page (with line segmentation)
|
|
73
|
+
const results = await ocr.predictPage(imagePath);
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## CLI
|
|
77
|
+
|
|
78
|
+
The package includes a command-line tool:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Single image
|
|
82
|
+
monocr image path/to/image.jpg
|
|
83
|
+
|
|
84
|
+
# PDF file
|
|
85
|
+
monocr pdf path/to/document.pdf
|
|
86
|
+
|
|
87
|
+
# Batch processing
|
|
88
|
+
monocr batch path/to/images/ -o results.json
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Examples
|
|
92
|
+
|
|
93
|
+
See the `examples/` directory for detailed usage examples:
|
|
94
|
+
|
|
95
|
+
- `simple.js` - Basic image OCR
|
|
96
|
+
- `with-accuracy.js` - OCR with accuracy measurement
|
|
97
|
+
- `batch.js` - Batch processing
|
|
98
|
+
|
|
99
|
+
Run examples:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
node examples/simple.js
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Model Files
|
|
106
|
+
|
|
107
|
+
This package requires the ONNX model and charset files. Download them from:
|
|
108
|
+
|
|
109
|
+
- [GitHub Releases](https://github.com/janakh/monocr-onnx/releases)
|
|
110
|
+
|
|
111
|
+
Place them in a `model/` directory or specify custom paths in the API calls.
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT
|
package/bin/monocr.js
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#!/usr/bin/env node

/**
 * monocr command-line interface.
 *
 * Subcommands:
 *   image <path>      - OCR a single image file, print text to stdout
 *   pdf <path>        - OCR every page of a PDF, print each page to stdout
 *   batch <directory> - OCR every image in a directory, emit JSON results
 *
 * Progress/status messages go to stderr so stdout stays clean for piping.
 */

const { program } = require('commander');
const { read_image, read_pdf } = require('../src/index');
const fs = require('fs');
const path = require('path');

program
  .name('monocr')
  .description('Mon language OCR using ONNX Runtime')
  // Keep in sync with "version" in package.json (0.1.1).
  .version('0.1.1');

program
  .command('image <path>')
  .description('Recognize text from an image file')
  .option('-m, --model <path>', 'Path to ONNX model', '../model/monocr.onnx')
  .option('-c, --charset <path>', 'Path to charset file', '../model/charset.txt')
  .action(async (imagePath, options) => {
    try {
      const text = await read_image(imagePath, options.model, options.charset);
      console.log(text);
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  });

program
  .command('pdf <path>')
  .description('Recognize text from a PDF file')
  .option('-m, --model <path>', 'Path to ONNX model', '../model/monocr.onnx')
  .option('-c, --charset <path>', 'Path to charset file', '../model/charset.txt')
  .action(async (pdfPath, options) => {
    try {
      const pages = await read_pdf(pdfPath, options.model, options.charset);
      pages.forEach((pageText, i) => {
        console.log(`--- Page ${i + 1} ---`);
        console.log(pageText);
        console.log();
      });
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  });

program
  .command('batch <directory>')
  .description('Process all images in a directory')
  .option('-m, --model <path>', 'Path to ONNX model', '../model/monocr.onnx')
  .option('-c, --charset <path>', 'Path to charset file', '../model/charset.txt')
  .option('-o, --output <path>', 'Output file for results (optional)')
  .action(async (directory, options) => {
    try {
      // Only common raster formats; sorted for deterministic output order.
      const files = fs.readdirSync(directory)
        .filter(f => /\.(jpg|jpeg|png)$/i.test(f))
        .sort();

      const results = [];

      for (const file of files) {
        const filePath = path.join(directory, file);
        // Progress on stderr so stdout remains valid JSON when not using -o.
        console.error(`Processing: ${file}...`);

        try {
          const text = await read_image(filePath, options.model, options.charset);
          results.push({ file, text, success: true });
        } catch (err) {
          // A single bad file should not abort the whole batch; record it.
          results.push({ file, error: err.message, success: false });
        }
      }

      if (options.output) {
        fs.writeFileSync(options.output, JSON.stringify(results, null, 2));
        console.error(`Results written to ${options.output}`);
      } else {
        console.log(JSON.stringify(results, null, 2));
      }
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  });

program.parse();
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "monocr",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Cross-platform Mon (mnw) language OCR using ONNX Runtime. Supports Node.js.",
|
|
5
|
+
"main": "src/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"monocr": "bin/monocr.js"
|
|
8
|
+
},
|
|
9
|
+
"repository": {
|
|
10
|
+
"type": "git",
|
|
11
|
+
"url": "git+https://github.com/janakh/monocr-onnx.git",
|
|
12
|
+
"directory": "js"
|
|
13
|
+
},
|
|
14
|
+
"keywords": [
|
|
15
|
+
"ocr",
|
|
16
|
+
"mon",
|
|
17
|
+
"mnw",
|
|
18
|
+
"onnx",
|
|
19
|
+
"machine-learning"
|
|
20
|
+
],
|
|
21
|
+
"author": "Janakh",
|
|
22
|
+
"license": "MIT",
|
|
23
|
+
"bugs": {
|
|
24
|
+
"url": "https://github.com/janakh/monocr-onnx/issues"
|
|
25
|
+
},
|
|
26
|
+
"homepage": "https://github.com/janakh/monocr-onnx/tree/main/js#readme",
|
|
27
|
+
"files": [
|
|
28
|
+
"src/",
|
|
29
|
+
"bin/",
|
|
30
|
+
"README.md",
|
|
31
|
+
"LICENSE"
|
|
32
|
+
],
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"commander": "^11.1.0",
|
|
35
|
+
"onnxruntime-node": "^1.15.0",
|
|
36
|
+
"pdf2pic": "^3.2.0",
|
|
37
|
+
"sharp": "^0.32.0"
|
|
38
|
+
},
|
|
39
|
+
"publishConfig": {
|
|
40
|
+
"access": "public"
|
|
41
|
+
}
|
|
42
|
+
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
const MonOCR = require('./monocr');
|
|
2
|
+
const { calculateAccuracy } = require('./utils');
|
|
3
|
+
|
|
4
|
+
// Public module surface. The function names reference the `function`
// declarations below — hoisting makes them resolvable before their
// definitions appear in the file.
module.exports = {
  MonOCR,
  calculateAccuracy,
  read_image,
  read_pdf,
  read_image_with_accuracy
};
|
|
11
|
+
|
|
12
|
+
/**
 * Run OCR over a full image: segment it into lines, recognize each line,
 * and join the per-line text with newlines.
 * @param {string} imagePath - Path to image file
 * @param {string} modelPath - Path to ONNX model
 * @param {string} charsetPath - Path to charset file
 * @returns {Promise<string>} Recognized text
 */
async function read_image(imagePath, modelPath = '../model/monocr.onnx', charsetPath = '../model/charset.txt') {
  const reader = new MonOCR(modelPath, charsetPath);
  await reader.init();

  const lineResults = await reader.predictPage(imagePath);

  const textLines = [];
  for (const line of lineResults) {
    textLines.push(line.text);
  }
  return textLines.join('\n');
}
|
|
25
|
+
|
|
26
|
+
/**
 * Read text from a PDF file by rasterizing each page (via pdf2pic) into a
 * temp directory and running page-level OCR on each rendered image.
 * @param {string} pdfPath - Path to PDF file
 * @param {string} modelPath - Path to ONNX model
 * @param {string} charsetPath - Path to charset file
 * @returns {Promise<string[]>} Array of recognized text, one entry per page
 */
async function read_pdf(pdfPath, modelPath = '../model/monocr.onnx', charsetPath = '../model/charset.txt') {
  const { fromPath } = require('pdf2pic');
  const path = require('path');
  const fs = require('fs');
  const os = require('os');

  const ocr = new MonOCR(modelPath, charsetPath);
  await ocr.init();

  // Rendered pages go into a private temp directory; removed in `finally`
  // so an OCR failure mid-document no longer leaks the directory.
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'monocr-'));

  try {
    const converter = fromPath(pdfPath, {
      density: 300,          // DPI for rasterization
      format: 'png',
      width: 2480,           // A4 page at 300 DPI
      height: 3508,
      saveFilename: 'page',
      savePath: tempDir
    });

    const pages = [];
    let pageNum = 1;

    // No page-count API is used here: pages are converted 1, 2, 3, ... until
    // conversion throws, which is treated as "past the last page".
    // NOTE(review): a genuine conversion/OCR error on a middle page is
    // indistinguishable from end-of-document and silently ends the loop early.
    while (true) {
      try {
        const result = await converter(pageNum, { responseType: 'image' });
        const results = await ocr.predictPage(result.path);
        pages.push(results.map(r => r.text).join('\n'));
        pageNum++;
      } catch (err) {
        break;
      }
    }

    return pages;
  } finally {
    // Always clean up rendered page images, even on error.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Recognize text from an image and score the result against a known
 * ground-truth string.
 * @param {string} imagePath - Path to image file
 * @param {string} groundTruth - Expected text for accuracy calculation
 * @param {string} modelPath - Path to ONNX model
 * @param {string} charsetPath - Path to charset file
 * @returns {Promise<{text: string, accuracy: number}>}
 */
async function read_image_with_accuracy(imagePath, groundTruth, modelPath = '../model/monocr.onnx', charsetPath = '../model/charset.txt') {
  const recognized = await read_image(imagePath, modelPath, charsetPath);
  return {
    text: recognized,
    accuracy: parseFloat(calculateAccuracy(recognized, groundTruth))
  };
}
|
package/src/monocr.js
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
const ort = require('onnxruntime-node');
|
|
2
|
+
const sharp = require('sharp');
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const LineSegmenter = require('./segmenter');
|
|
5
|
+
|
|
6
|
+
/**
 * OCR engine for single text lines and full pages.
 *
 * Wraps an ONNX Runtime inference session: images are resized/padded to the
 * model's fixed 64x1024 grayscale input, run through the model, and the
 * output is CTC-greedy-decoded against a charset file.
 */
class MonOCR {
  /**
   * @param {string} modelPath - Path to the ONNX model file.
   * @param {string} charsetPath - Path to a text file whose characters map to
   *   output class indices 1..N (index 0 is reserved for the CTC blank).
   */
  constructor(modelPath, charsetPath) {
    this.modelPath = modelPath;
    this.charsetPath = charsetPath;
    this.session = null;                  // created lazily in init()
    this.charset = "";
    this.segmenter = new LineSegmenter(); // default segmentation parameters

    // Metadata: fixed model input size (batch 1, 1 channel, 64 x 1024).
    this.targetHeight = 64;
    this.targetWidth = 1024;
  }

  /**
   * Create the inference session and load the charset.
   * Idempotent: repeated calls after the first are no-ops.
   */
  async init() {
    if (this.session) return;
    this.session = await ort.InferenceSession.create(this.modelPath);
    if (this.charsetPath) {
      // trim() strips a trailing newline so it doesn't become a character class.
      this.charset = fs.readFileSync(this.charsetPath, 'utf-8').trim();
    }
  }

  /**
   * Replicates Python's resize_and_pad:
   * 1. Resize height to 64, maintain aspect ratio.
   * 2. Pad width to 1024 (white background).
   * 3. Normalize to [-1, 1].
   *
   * @param {string|Buffer|object} imageSource - File path, image buffer, or an
   *   existing sharp pipeline (detected by having a metadata() function).
   * @returns {Promise<object>} float32 ort.Tensor of shape [1, 1, 64, 1024].
   */
  async preprocess(imageSource) {
    let sharpImg;
    // Duck-type check: sharp instances expose metadata(); anything else is
    // treated as a path/buffer input for sharp().
    if (typeof imageSource.metadata === 'function') {
      sharpImg = imageSource;
    } else {
      sharpImg = sharp(imageSource);
    }
    const metadata = await sharpImg.metadata();

    // Scale so height becomes exactly targetHeight; clamp width to the
    // model's maximum (wider lines are horizontally squeezed by 'fill').
    const scale = this.targetHeight / metadata.height;
    const newWidth = Math.min(this.targetWidth, Math.round(metadata.width * scale));

    // Create the grayscale resized image (raw = 1 byte per pixel, row-major)
    const resizedBuffer = await sharpImg
      .grayscale()
      .resize({
        height: this.targetHeight,
        width: newWidth,
        fit: 'fill'
      })
      .raw()
      .toBuffer();

    // Create target canvas (1024 width, white background = 255)
    const totalSize = this.targetHeight * this.targetWidth;
    const canvas = new Float32Array(totalSize).fill(1.0);

    // Fill canvas with resized image and normalize
    // Python: canvas = canvas.astype(np.float32) / 127.5 - 1.0
    // 255 -> 1.0
    // 0 -> -1.0

    for (let y = 0; y < this.targetHeight; y++) {
      for (let x = 0; x < this.targetWidth; x++) {
        const canvasIdx = y * this.targetWidth + x;
        if (x < newWidth) {
          const imgIdx = y * newWidth + x;
          const pixelValue = resizedBuffer[imgIdx];
          canvas[canvasIdx] = (pixelValue / 127.5) - 1.0;
        } else {
          // Padding is white (redundant with the fill(1.0) above, but explicit)
          canvas[canvasIdx] = (255 / 127.5) - 1.0; // 1.0
        }
      }
    }

    return new ort.Tensor('float32', canvas, [1, 1, this.targetHeight, this.targetWidth]);
  }

  /**
   * CTC Greedy Decoding
   * Ignores blank (0) and contracts repeats.
   *
   * Takes the argmax class per time step; a character is emitted only when it
   * is not the blank and differs from the previous time step's argmax.
   *
   * @param {object} outputTensor - Model output; dims are [Batch, Time, Classes]
   *   (only batch 0 is read — data is indexed as t * numClasses + c).
   * @returns {string} Decoded text.
   */
  decode(outputTensor) {
    const data = outputTensor.data;
    const dims = outputTensor.dims; // [Batch, Time, Classes]
    const numClasses = dims[2];
    const sequenceLength = dims[1];

    // Class index i+1 maps to charset[i]; index 0 is the CTC blank.
    const idx2char = {};
    for (let i = 0; i < this.charset.length; i++) {
      idx2char[i + 1] = this.charset[i];
    }

    let decodedText = "";
    let prevIdx = -1;

    for (let t = 0; t < sequenceLength; t++) {
      // Argmax over classes for this time step.
      let maxVal = -Infinity;
      let maxIdx = 0;
      for (let c = 0; c < numClasses; c++) {
        const val = data[t * numClasses + c];
        if (val > maxVal) {
          maxVal = val;
          maxIdx = c;
        }
      }

      // CTC logic: 0 is blank, ignore repeats
      if (maxIdx !== 0 && maxIdx !== prevIdx) {
        // Indices past the charset length decode to "" (dropped silently).
        decodedText += idx2char[maxIdx] || "";
      }
      prevIdx = maxIdx;
    }

    return decodedText;
  }

  /**
   * Recognize a single text-line image.
   * @param {string|Buffer|object} imageSource - Same accepted forms as preprocess().
   * @returns {Promise<string>} Decoded line text.
   */
  async predictLine(imageSource) {
    if (!this.session) await this.init();

    const inputTensor = await this.preprocess(imageSource);
    // Feed/output names come from the model itself, so this works regardless
    // of how the exporter named them.
    const feeds = {};
    feeds[this.session.inputNames[0]] = inputTensor;

    const results = await this.session.run(feeds);
    const outputTensor = results[this.session.outputNames[0]];

    return this.decode(outputTensor);
  }

  /**
   * Processes full page: segments into lines and predicts each.
   * @param {string} imagePath - Path to the page image.
   * @returns {Promise<Array<{text: string, bbox: object}>>} One entry per
   *   detected line, in top-to-bottom order, with its bounding box.
   */
  async predictPage(imagePath) {
    const lines = await this.segmenter.segment(imagePath);
    const results = [];

    // Lines are processed sequentially; the session is shared state.
    for (const line of lines) {
      const text = await this.predictLine(line.img);
      results.push({
        text,
        bbox: line.bbox
      });
    }

    return results;
  }
}

module.exports = MonOCR;
|
package/src/segmenter.js
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
const sharp = require('sharp');
|
|
2
|
+
|
|
3
|
+
/**
 * Segments a page image into horizontal text lines using a horizontal
 * projection profile: rows with many dark pixels are text, rows with few are
 * inter-line gaps. Assumes dark text on a light background.
 */
class LineSegmenter {
  /**
   * @param {number} minLineH Minimum height of a line to be considered valid.
   * @param {number} smoothWindow Smoothing window for projection profile.
   */
  constructor(minLineH = 10, smoothWindow = 3) {
    this.minLineH = minLineH;
    this.smoothWindow = smoothWindow;
  }

  /**
   * Segment a document image into text lines.
   * @param {string|Buffer} imagePath Path to image or Buffer.
   * @returns {Promise<Array<{img: sharp.Sharp, bbox: {x: number, y: number, w: number, h: number}}>>}
   */
  async segment(imagePath) {
    const image = sharp(imagePath);
    const { width, height } = await image.metadata();

    // 1. Get raw grayscale data for thresholding
    // NOTE(review): sharp operations mutate the pipeline on `image`, so the
    // `image.clone()` used later in _extractLine inherits these
    // grayscale()+raw() settings — confirm crops behave as intended downstream.
    const grayBuffer = await image
      .grayscale()
      .raw()
      .toBuffer();

    // 2. Simple Adaptive-ish Thresholding
    // Since we don't have CV2's adaptiveThreshold easily, we'll do a simple threshold
    // or just use sharp's threshold if we can get the mask.
    // Actually, to replicate Horizontal Projection, we need the sum of "text" pixels.
    // We'll treat dark pixels (< 128) as text (since background is white).
    const binary = new Uint8Array(grayBuffer.length);
    const hist = new Float32Array(height).fill(0); // hist[y] = dark pixels in row y

    for (let y = 0; y < height; y++) {
      for (let x = 0; x < width; x++) {
        const idx = y * width + x;
        // Threshold: 128 is a safe bet for black text on white paper.
        // Inverted so text is "high" (1) and background is 0.
        if (grayBuffer[idx] < 128) {
          binary[idx] = 1;
          hist[y]++;
        } else {
          binary[idx] = 0;
        }
      }
    }

    // 3. Smoothing projection profile (moving average, edges use a shorter window)
    let smoothedHist = hist;
    if (this.smoothWindow > 1) {
      smoothedHist = new Float32Array(height);
      const half = Math.floor(this.smoothWindow / 2);
      for (let i = 0; i < height; i++) {
        let sum = 0;
        let count = 0;
        for (let j = i - half; j <= i + half; j++) {
          if (j >= 0 && j < height) {
            sum += hist[j];
            count++;
          }
        }
        smoothedHist[i] = sum / count;
      }
    }

    // 4. Gap Detection
    // A row is "text" when its smoothed density exceeds 5% of the mean
    // density over all non-empty rows.
    const nonZeroVals = smoothedHist.filter(v => v > 0);
    if (nonZeroVals.length === 0) return []; // blank page

    const meanDensity = nonZeroVals.reduce((a, b) => a + b, 0) / nonZeroVals.length;
    const gapThreshold = meanDensity * 0.05;

    const results = [];
    let start = null; // top row of the line currently being scanned, or null

    for (let y = 0; y < height; y++) {
      const isText = smoothedHist[y] > gapThreshold;
      if (isText && start === null) {
        start = y; // line begins
      } else if (!isText && start !== null) {
        const end = y; // line ended on the previous row
        if (end - start >= this.minLineH) {
          await this._extractLine(image, grayBuffer, width, height, start, end, results);
        }
        start = null;
      }
    }

    // Flush a line that runs to the bottom edge of the page.
    if (start !== null && (height - start) >= this.minLineH) {
      await this._extractLine(image, grayBuffer, width, height, start, height, results);
    }

    return results;
  }

  /**
   * Crop one detected line [rStart, rEnd) out of the page and append it
   * (with its bounding box) to `results`.
   */
  async _extractLine(image, grayBuffer, width, height, rStart, rEnd, results) {
    // Find horizontal bounds within this vertical strip
    let xMin = width;
    let xMax = 0;
    let hasPixels = false;

    for (let y = rStart; y < rEnd; y++) {
      for (let x = 0; x < width; x++) {
        // Same < 128 darkness test used to build the projection profile.
        if (grayBuffer[y * width + x] < 128) {
          if (x < xMin) xMin = x;
          if (x > xMax) xMax = x;
          hasPixels = true;
        }
      }
    }

    if (!hasPixels) return; // strip passed the profile test but has no dark pixels

    // Add padding (clamped to the image edges)
    const pad = 4;
    const y1 = Math.max(0, rStart - pad);
    const y2 = Math.min(height, rEnd + pad);
    const x1 = Math.max(0, xMin - pad);
    const x2 = Math.min(width, xMax + pad);

    const w = x2 - x1;
    const h = y2 - y1;

    // Crop the line (clone keeps the original pipeline reusable for later lines)
    const crop = image.clone().extract({ left: x1, top: y1, width: w, height: h });

    results.push({
      img: crop,
      bbox: { x: x1, y: y1, w, h }
    });
  }
}

module.exports = LineSegmenter;
|
package/src/utils.js
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * Calculate character-level accuracy between predicted and ground truth text.
 *
 * Compares the two strings position by position over the length of the longer
 * one; every mismatched or missing position counts as one error.
 *
 * @param {string} predicted - Predicted text
 * @param {string} groundTruth - Ground truth text
 * @returns {number} Accuracy percentage (0-100), rounded to 2 decimal places
 */
function calculateAccuracy(predicted, groundTruth) {
  // Explicit null/undefined guard: the old `!groundTruth` check also
  // rejected "", so two empty strings scored 0 instead of a perfect match.
  if (predicted == null || groundTruth == null) return 0;

  const len = Math.max(predicted.length, groundTruth.length);
  if (len === 0) return 100; // both empty: perfect match

  let errors = 0;
  for (let i = 0; i < len; i++) {
    // Indexing past a string's end yields undefined, so length differences
    // count as errors too.
    if (predicted[i] !== groundTruth[i]) {
      errors++;
    }
  }

  // toFixed() returns a string; convert back so the function actually returns
  // the documented number (callers using parseFloat keep working).
  return Number(((1 - errors / len) * 100).toFixed(2));
}
|
22
|
+
|
|
23
|
+
// Public module surface.
module.exports = {
  calculateAccuracy
};
|