@areb0s/ocr-common 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/Ocr.d.ts +15 -0
- package/build/Ocr.js +24 -0
- package/build/Ocr.js.map +1 -0
- package/build/backend/FileUtilsBase.d.ts +3 -0
- package/build/backend/FileUtilsBase.js +6 -0
- package/build/backend/FileUtilsBase.js.map +1 -0
- package/build/backend/ImageRawBase.d.ts +8 -0
- package/build/backend/ImageRawBase.js +14 -0
- package/build/backend/ImageRawBase.js.map +1 -0
- package/build/backend/backend.d.ts +14 -0
- package/build/backend/backend.js +14 -0
- package/build/backend/backend.js.map +1 -0
- package/build/backend/index.d.ts +3 -0
- package/build/backend/index.js +4 -0
- package/build/backend/index.js.map +1 -0
- package/build/backend/splitIntoLineImages.d.ts +2 -0
- package/build/backend/splitIntoLineImages.js +231 -0
- package/build/backend/splitIntoLineImages.js.map +1 -0
- package/build/index.d.ts +6 -0
- package/build/index.js +6 -0
- package/build/index.js.map +1 -0
- package/build/models/Detection.d.ts +13 -0
- package/build/models/Detection.js +82 -0
- package/build/models/Detection.js.map +1 -0
- package/build/models/ModelBase.d.ts +14 -0
- package/build/models/ModelBase.js +53 -0
- package/build/models/ModelBase.js.map +1 -0
- package/build/models/Recognition.d.ts +12 -0
- package/build/models/Recognition.js +190 -0
- package/build/models/Recognition.js.map +1 -0
- package/build/models/index.d.ts +2 -0
- package/build/models/index.js +3 -0
- package/build/models/index.js.map +1 -0
- package/build/types/index.d.ts +1 -0
- package/build/types/index.js +2 -0
- package/build/types/index.js.map +1 -0
- package/build/types/types.d.ts +67 -0
- package/build/types/types.js +5 -0
- package/build/types/types.js.map +1 -0
- package/package.json +34 -0
- package/src/Ocr.ts +34 -0
- package/src/backend/FileUtilsBase.ts +5 -0
- package/src/backend/ImageRawBase.ts +17 -0
- package/src/backend/backend.ts +29 -0
- package/src/backend/index.ts +3 -0
- package/src/backend/splitIntoLineImages.ts +296 -0
- package/src/index.ts +8 -0
- package/src/models/Detection.ts +96 -0
- package/src/models/ModelBase.ts +74 -0
- package/src/models/Recognition.ts +217 -0
- package/src/models/index.ts +2 -0
- package/src/types/global.d.ts +7 -0
- package/src/types/index.ts +1 -0
- package/src/types/types.ts +90 -0
- package/tsconfig.json +10 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
import cv from '@techstark/opencv-js'
|
|
2
|
+
import clipper from 'js-clipper'
|
|
3
|
+
import { ImageRaw } from '#common/backend'
|
|
4
|
+
import type { LineImage, ImageRaw as ImageRawType } from '#common/types'
|
|
5
|
+
|
|
6
|
+
type pointType = [number, number]
|
|
7
|
+
type BoxType = [pointType, pointType, pointType, pointType]
|
|
8
|
+
type pointsType = pointType[]
|
|
9
|
+
|
|
10
|
+
export async function splitIntoLineImages(image: ImageRawType, sourceImage: ImageRawType): Promise<LineImage[]> {
|
|
11
|
+
const w = image.width
|
|
12
|
+
const h = image.height
|
|
13
|
+
const srcData = sourceImage
|
|
14
|
+
|
|
15
|
+
const edgeRect: { box: BoxType; image: ImageRawType }[] = []
|
|
16
|
+
|
|
17
|
+
const src = cvImread(image)
|
|
18
|
+
|
|
19
|
+
cv.cvtColor(src, src, cv.COLOR_RGBA2GRAY, 0)
|
|
20
|
+
const contours = new cv.MatVector()
|
|
21
|
+
const hierarchy = new cv.Mat()
|
|
22
|
+
|
|
23
|
+
cv.findContours(src, contours, hierarchy, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)
|
|
24
|
+
|
|
25
|
+
for (let i = 0; i < contours.size(); i++) {
|
|
26
|
+
const minSize = 3
|
|
27
|
+
const cnt = contours.get(i)
|
|
28
|
+
const { points, sside } = getMiniBoxes(cnt)
|
|
29
|
+
if (sside < minSize) continue
|
|
30
|
+
// TODO sort fast
|
|
31
|
+
|
|
32
|
+
const clipBox = unclip(points)
|
|
33
|
+
|
|
34
|
+
const boxMap = cv.matFromArray(clipBox.length / 2, 1, cv.CV_32SC2, clipBox)
|
|
35
|
+
|
|
36
|
+
const resultObj = getMiniBoxes(boxMap)
|
|
37
|
+
const box = resultObj.points
|
|
38
|
+
if (resultObj.sside < minSize + 2) {
|
|
39
|
+
continue
|
|
40
|
+
}
|
|
41
|
+
function clip(n: number, min: number, max: number) {
|
|
42
|
+
return Math.max(min, Math.min(n, max))
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
const rx = srcData.width / w
|
|
46
|
+
const ry = srcData.height / h
|
|
47
|
+
|
|
48
|
+
for (let i = 0; i < box.length; i++) {
|
|
49
|
+
box[i][0] *= rx
|
|
50
|
+
box[i][1] *= ry
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
const box1 = orderPointsClockwise(box)
|
|
54
|
+
box1.forEach((item) => {
|
|
55
|
+
item[0] = clip(Math.round(item[0]), 0, srcData.width)
|
|
56
|
+
item[1] = clip(Math.round(item[1]), 0, srcData.height)
|
|
57
|
+
})
|
|
58
|
+
const rect_width = int(linalgNorm(box1[0], box1[1]))
|
|
59
|
+
const rect_height = int(linalgNorm(box1[0], box1[3]))
|
|
60
|
+
if (rect_width <= 3 || rect_height <= 3) continue
|
|
61
|
+
|
|
62
|
+
const c = getRotateCropImage(srcData, box)
|
|
63
|
+
|
|
64
|
+
edgeRect.push({
|
|
65
|
+
box,
|
|
66
|
+
image: c,
|
|
67
|
+
})
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
src.delete()
|
|
71
|
+
contours.delete()
|
|
72
|
+
hierarchy.delete()
|
|
73
|
+
|
|
74
|
+
return edgeRect
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
function getMiniBoxes(contour: any) {
|
|
78
|
+
const boundingBox = cv.minAreaRect(contour)
|
|
79
|
+
const points = Array.from(boxPoints(boundingBox.center, boundingBox.size, boundingBox.angle)).sort(
|
|
80
|
+
(a, b) => a[0] - b[0],
|
|
81
|
+
) as pointsType
|
|
82
|
+
|
|
83
|
+
let index_1 = 0,
|
|
84
|
+
index_2 = 1,
|
|
85
|
+
index_3 = 2,
|
|
86
|
+
index_4 = 3
|
|
87
|
+
if (points[1][1] > points[0][1]) {
|
|
88
|
+
index_1 = 0
|
|
89
|
+
index_4 = 1
|
|
90
|
+
} else {
|
|
91
|
+
index_1 = 1
|
|
92
|
+
index_4 = 0
|
|
93
|
+
}
|
|
94
|
+
if (points[3][1] > points[2][1]) {
|
|
95
|
+
index_2 = 2
|
|
96
|
+
index_3 = 3
|
|
97
|
+
} else {
|
|
98
|
+
index_2 = 3
|
|
99
|
+
index_3 = 2
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
const box = [points[index_1], points[index_2], points[index_3], points[index_4]] as BoxType
|
|
103
|
+
const side = Math.min(boundingBox.size.height, boundingBox.size.width)
|
|
104
|
+
return { points: box, sside: side }
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
function unclip(box: pointsType) {
|
|
108
|
+
const unclip_ratio = 1.5
|
|
109
|
+
const area = Math.abs(polygonPolygonArea(box))
|
|
110
|
+
const length = polygonPolygonLength(box)
|
|
111
|
+
const distance = (area * unclip_ratio) / length
|
|
112
|
+
const tmpArr: { X: number; Y: number }[] = []
|
|
113
|
+
box.forEach((item) => {
|
|
114
|
+
const obj = {
|
|
115
|
+
X: 0,
|
|
116
|
+
Y: 0,
|
|
117
|
+
}
|
|
118
|
+
obj.X = item[0]
|
|
119
|
+
obj.Y = item[1]
|
|
120
|
+
tmpArr.push(obj)
|
|
121
|
+
})
|
|
122
|
+
const offset = new clipper.ClipperOffset()
|
|
123
|
+
offset.AddPath(tmpArr, clipper.JoinType.jtRound, clipper.EndType.etClosedPolygon)
|
|
124
|
+
const expanded: { X: number; Y: number }[][] = []
|
|
125
|
+
offset.Execute(expanded, distance)
|
|
126
|
+
let expandedArr: pointsType = []
|
|
127
|
+
expanded[0] &&
|
|
128
|
+
expanded[0].forEach((item) => {
|
|
129
|
+
expandedArr.push([item.X, item.Y])
|
|
130
|
+
})
|
|
131
|
+
expandedArr = [].concat(...(<any>expandedArr))
|
|
132
|
+
|
|
133
|
+
return expandedArr
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
function orderPointsClockwise(pts: BoxType) {
|
|
137
|
+
const rect: BoxType = [
|
|
138
|
+
[0, 0],
|
|
139
|
+
[0, 0],
|
|
140
|
+
[0, 0],
|
|
141
|
+
[0, 0],
|
|
142
|
+
]
|
|
143
|
+
const s = pts.map((pt) => pt[0] + pt[1])
|
|
144
|
+
rect[0] = pts[s.indexOf(Math.min(...s))]
|
|
145
|
+
rect[2] = pts[s.indexOf(Math.max(...s))]
|
|
146
|
+
const tmp = pts.filter((pt) => pt !== rect[0] && pt !== rect[2])
|
|
147
|
+
const diff = tmp[1].map((e, i) => e - tmp[0][i])
|
|
148
|
+
rect[1] = tmp[diff.indexOf(Math.min(...diff))]
|
|
149
|
+
rect[3] = tmp[diff.indexOf(Math.max(...diff))]
|
|
150
|
+
return rect
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
function linalgNorm(p0: pointType, p1: pointType) {
|
|
154
|
+
return Math.sqrt(Math.pow(p0[0] - p1[0], 2) + Math.pow(p0[1] - p1[1], 2))
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
function int(num: number) {
|
|
158
|
+
return num > 0 ? Math.floor(num) : Math.ceil(num)
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
function getRotateCropImage(imageRaw: ImageRawType, points: BoxType): ImageRawType {
|
|
162
|
+
const img_crop_width = int(Math.max(linalgNorm(points[0], points[1]), linalgNorm(points[2], points[3])))
|
|
163
|
+
const img_crop_height = int(Math.max(linalgNorm(points[0], points[3]), linalgNorm(points[1], points[2])))
|
|
164
|
+
const pts_std = [
|
|
165
|
+
[0, 0],
|
|
166
|
+
[img_crop_width, 0],
|
|
167
|
+
[img_crop_width, img_crop_height],
|
|
168
|
+
[0, img_crop_height],
|
|
169
|
+
]
|
|
170
|
+
|
|
171
|
+
const srcTri = cv.matFromArray(4, 1, cv.CV_32FC2, flatten(points))
|
|
172
|
+
const dstTri = cv.matFromArray(4, 1, cv.CV_32FC2, flatten(pts_std))
|
|
173
|
+
|
|
174
|
+
// 获取到目标矩阵
|
|
175
|
+
const M = cv.getPerspectiveTransform(srcTri, dstTri)
|
|
176
|
+
const src = cvImread(imageRaw)
|
|
177
|
+
const dst = new cv.Mat()
|
|
178
|
+
const dsize = new cv.Size(img_crop_width, img_crop_height)
|
|
179
|
+
// 透视转换
|
|
180
|
+
cv.warpPerspective(src, dst, M, dsize, cv.INTER_CUBIC, cv.BORDER_REPLICATE, new cv.Scalar())
|
|
181
|
+
|
|
182
|
+
const dst_img_height = (<any>dst).matSize[0]
|
|
183
|
+
const dst_img_width = (<any>dst).matSize[1]
|
|
184
|
+
let dst_rot
|
|
185
|
+
// 图像旋转
|
|
186
|
+
if (dst_img_height / dst_img_width >= 1.5) {
|
|
187
|
+
dst_rot = new cv.Mat()
|
|
188
|
+
const dsize_rot = new cv.Size(dst.rows, dst.cols)
|
|
189
|
+
const center = new cv.Point(dst.cols / 2, dst.cols / 2)
|
|
190
|
+
const M = cv.getRotationMatrix2D(center, 90, 1)
|
|
191
|
+
cv.warpAffine(dst, dst_rot, M, dsize_rot, cv.INTER_CUBIC, cv.BORDER_REPLICATE, new cv.Scalar())
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
src.delete()
|
|
195
|
+
srcTri.delete()
|
|
196
|
+
dstTri.delete()
|
|
197
|
+
if (dst_rot) {
|
|
198
|
+
dst.delete()
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
return cvImshow(dst_rot || dst)
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
function boxPoints(center: { x: number; y: number }, size: { width: number; height: number }, angle: number) {
|
|
205
|
+
const width = size.width
|
|
206
|
+
const height = size.height
|
|
207
|
+
|
|
208
|
+
const theta = (angle * Math.PI) / 180.0
|
|
209
|
+
const cosTheta = Math.cos(theta)
|
|
210
|
+
const sinTheta = Math.sin(theta)
|
|
211
|
+
|
|
212
|
+
const cx = center.x
|
|
213
|
+
const cy = center.y
|
|
214
|
+
|
|
215
|
+
const dx = width * 0.5
|
|
216
|
+
const dy = height * 0.5
|
|
217
|
+
|
|
218
|
+
const rotatedPoints: any[] = []
|
|
219
|
+
|
|
220
|
+
// Top-Left
|
|
221
|
+
const x1 = cx - dx * cosTheta + dy * sinTheta
|
|
222
|
+
const y1 = cy - dx * sinTheta - dy * cosTheta
|
|
223
|
+
rotatedPoints.push([x1, y1])
|
|
224
|
+
|
|
225
|
+
// Top-Right
|
|
226
|
+
const x2 = cx + dx * cosTheta + dy * sinTheta
|
|
227
|
+
const y2 = cy + dx * sinTheta - dy * cosTheta
|
|
228
|
+
rotatedPoints.push([x2, y2])
|
|
229
|
+
|
|
230
|
+
// Bottom-Right
|
|
231
|
+
const x3 = cx + dx * cosTheta - dy * sinTheta
|
|
232
|
+
const y3 = cy + dx * sinTheta + dy * cosTheta
|
|
233
|
+
rotatedPoints.push([x3, y3])
|
|
234
|
+
|
|
235
|
+
// Bottom-Left
|
|
236
|
+
const x4 = cx - dx * cosTheta - dy * sinTheta
|
|
237
|
+
const y4 = cy - dx * sinTheta + dy * cosTheta
|
|
238
|
+
rotatedPoints.push([x4, y4])
|
|
239
|
+
|
|
240
|
+
return rotatedPoints
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
function polygonPolygonArea(polygon: pointsType) {
|
|
244
|
+
let i = -1,
|
|
245
|
+
n = polygon.length,
|
|
246
|
+
a: pointType,
|
|
247
|
+
b = polygon[n - 1],
|
|
248
|
+
area = 0
|
|
249
|
+
|
|
250
|
+
while (++i < n) {
|
|
251
|
+
a = b
|
|
252
|
+
b = polygon[i]
|
|
253
|
+
area += a[1] * b[0] - a[0] * b[1]
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
return area / 2
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
function polygonPolygonLength(polygon: pointsType) {
|
|
260
|
+
let i = -1,
|
|
261
|
+
n = polygon.length,
|
|
262
|
+
b = polygon[n - 1],
|
|
263
|
+
xa: number,
|
|
264
|
+
ya: number,
|
|
265
|
+
xb = b[0],
|
|
266
|
+
yb = b[1],
|
|
267
|
+
perimeter = 0
|
|
268
|
+
|
|
269
|
+
while (++i < n) {
|
|
270
|
+
xa = xb
|
|
271
|
+
ya = yb
|
|
272
|
+
b = polygon[i]
|
|
273
|
+
xb = b[0]
|
|
274
|
+
yb = b[1]
|
|
275
|
+
xa -= xb
|
|
276
|
+
ya -= yb
|
|
277
|
+
perimeter += Math.hypot(xa, ya)
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
return perimeter
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
function flatten(arr: number[] | number[][]) {
|
|
284
|
+
return arr
|
|
285
|
+
.toString()
|
|
286
|
+
.split(',')
|
|
287
|
+
.map((item) => +item)
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
// Wraps an ImageRaw (pixel buffer + width/height) into an OpenCV Mat.
// Named after Python's cv2.imread for familiarity.
function cvImread(image: ImageRawType) {
  return cv.matFromImageData(image)
}
|
|
293
|
+
|
|
294
|
+
function cvImshow(mat: cv.Mat): ImageRawType {
|
|
295
|
+
return new ImageRaw({ data: mat.data, width: mat.cols, height: mat.rows })
|
|
296
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import type { InferenceSession as InferenceSessionCommon, Tensor } from 'onnxruntime-common'
|
|
2
|
+
import invariant from 'tiny-invariant'
|
|
3
|
+
import { ImageRaw, InferenceSession, defaultModels, splitIntoLineImages } from '#common/backend'
|
|
4
|
+
import type { ImageRawData, BrowserImageInput, ImageRaw as ImageRawType, ModelCreateOptions, Size } from '#common/types'
|
|
5
|
+
import { ModelBase } from './ModelBase'
|
|
6
|
+
|
|
7
|
+
const BASE_SIZE = 32
|
|
8
|
+
|
|
9
|
+
export class Detection extends ModelBase {
|
|
10
|
+
static async create({ models, onnxOptions = {}, ...restOptions }: ModelCreateOptions) {
|
|
11
|
+
const detectionPath = models?.detectionPath || defaultModels?.detectionPath
|
|
12
|
+
invariant(detectionPath, 'detectionPath is required')
|
|
13
|
+
const model = await InferenceSession.create(detectionPath, onnxOptions)
|
|
14
|
+
return new Detection({ model, options: restOptions })
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
async run(input: string | ImageRawData | BrowserImageInput, { onnxOptions = {} }: { onnxOptions?: InferenceSessionCommon.RunOptions } = {}) {
|
|
18
|
+
// Use ImageRaw.from() factory method if available (browser), otherwise fallback to legacy handling
|
|
19
|
+
const image = typeof (ImageRaw as any).from === 'function'
|
|
20
|
+
? await (ImageRaw as any).from(input)
|
|
21
|
+
: typeof input === "string"
|
|
22
|
+
? await ImageRaw.open(input)
|
|
23
|
+
: new ImageRaw(input as ImageRawData)
|
|
24
|
+
|
|
25
|
+
// Resize image to multiple of 32
|
|
26
|
+
// - image width and height must be a multiple of 32
|
|
27
|
+
// - bigger image -> more accurate result, but takes longer time
|
|
28
|
+
// inputImage = await Image.resize(image, multipleOfBaseSize(image, { maxSize: 960 }))
|
|
29
|
+
const inputImage = await image.resize(multipleOfBaseSize(image))
|
|
30
|
+
this.debugImage(inputImage, 'out1-multiple-of-base-size.jpg')
|
|
31
|
+
|
|
32
|
+
// Covert image data to model data
|
|
33
|
+
// - Using `(RGB / 255 - mean) / std` formula
|
|
34
|
+
// - omit reshapeOptions (mean/std) is more accurate, can creaet a run option for them
|
|
35
|
+
const modelData = this.imageToInput(inputImage, {
|
|
36
|
+
// mean: [0.485, 0.456, 0.406],
|
|
37
|
+
// std: [0.229, 0.224, 0.225],
|
|
38
|
+
})
|
|
39
|
+
|
|
40
|
+
// Run the model
|
|
41
|
+
// console.time('Detection')
|
|
42
|
+
const modelOutput = await this.runModel({ modelData, onnxOptions })
|
|
43
|
+
// console.timeEnd('Detection')
|
|
44
|
+
|
|
45
|
+
// Convert output data back to image data
|
|
46
|
+
// - output value is from 0 to 1, a probability, if value > 0.3, it is a text
|
|
47
|
+
// - returns a black and white image
|
|
48
|
+
const outputImage = outputToImage(modelOutput, 0.03)
|
|
49
|
+
this.debugImage(outputImage, 'out2-black-white.jpg')
|
|
50
|
+
|
|
51
|
+
// Find text boxes, split image into lines
|
|
52
|
+
// - findContours from the image
|
|
53
|
+
// - returns text boxes and line images
|
|
54
|
+
const lineImages = await splitIntoLineImages(outputImage, inputImage)
|
|
55
|
+
this.debugBoxImage(inputImage, lineImages, 'boxes.jpg')
|
|
56
|
+
|
|
57
|
+
return {
|
|
58
|
+
lineImages,
|
|
59
|
+
resizedImageWidth: inputImage.width,
|
|
60
|
+
resizedImageHeight: inputImage.height,
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
function multipleOfBaseSize(image: ImageRawType, { maxSize }: { maxSize?: number } = {}): Size {
|
|
66
|
+
let width = image.width
|
|
67
|
+
let height = image.height
|
|
68
|
+
if (maxSize && Math.max(width, height) > maxSize) {
|
|
69
|
+
const ratio = width > height ? maxSize / width : maxSize / height
|
|
70
|
+
width = width * ratio
|
|
71
|
+
height = height * ratio
|
|
72
|
+
}
|
|
73
|
+
const newWidth = Math.max(
|
|
74
|
+
// Math.round
|
|
75
|
+
// Math.ceil
|
|
76
|
+
Math.ceil(width / BASE_SIZE) * BASE_SIZE,
|
|
77
|
+
BASE_SIZE,
|
|
78
|
+
)
|
|
79
|
+
const newHeight = Math.max(Math.ceil(height / BASE_SIZE) * BASE_SIZE, BASE_SIZE)
|
|
80
|
+
return { width: newWidth, height: newHeight }
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
function outputToImage(output: Tensor, threshold: number): ImageRawType {
|
|
84
|
+
const height = output.dims[2]
|
|
85
|
+
const width = output.dims[3]
|
|
86
|
+
const data = new Uint8Array(width * height * 4)
|
|
87
|
+
for (const [outIndex, outValue] of output.data.entries()) {
|
|
88
|
+
const n = outIndex * 4
|
|
89
|
+
const value = (outValue as number) > threshold ? 255 : 0
|
|
90
|
+
data[n] = value // R
|
|
91
|
+
data[n + 1] = value // G
|
|
92
|
+
data[n + 2] = value // B
|
|
93
|
+
data[n + 3] = 255 // A
|
|
94
|
+
}
|
|
95
|
+
return new ImageRaw({ data, width, height })
|
|
96
|
+
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { type InferenceSession as InferenceSessionCommon, Tensor } from 'onnxruntime-common'
|
|
2
|
+
import type {
|
|
3
|
+
ImageRaw,
|
|
4
|
+
InferenceSession,
|
|
5
|
+
LineImage,
|
|
6
|
+
ModelBaseConstructorArg,
|
|
7
|
+
ModelBaseOptions,
|
|
8
|
+
ModelData,
|
|
9
|
+
ReshapeOptions,
|
|
10
|
+
} from '#common/types'
|
|
11
|
+
|
|
12
|
+
export class ModelBase {
|
|
13
|
+
options: ModelBaseOptions
|
|
14
|
+
#model: InferenceSession
|
|
15
|
+
|
|
16
|
+
constructor({ model, options }: ModelBaseConstructorArg) {
|
|
17
|
+
this.#model = model
|
|
18
|
+
this.options = options
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
async runModel({
|
|
22
|
+
modelData,
|
|
23
|
+
onnxOptions = {},
|
|
24
|
+
}: { modelData: ModelData; onnxOptions?: InferenceSessionCommon.RunOptions }) {
|
|
25
|
+
const input = this.#prepareInput(modelData)
|
|
26
|
+
const outputs = await this.#model.run(
|
|
27
|
+
{
|
|
28
|
+
[this.#model.inputNames[0]]: input,
|
|
29
|
+
},
|
|
30
|
+
onnxOptions,
|
|
31
|
+
)
|
|
32
|
+
const output = outputs[this.#model.outputNames[0]]
|
|
33
|
+
return output
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
#prepareInput(modelData: ModelData) {
|
|
37
|
+
const input = Float32Array.from(modelData.data)
|
|
38
|
+
return new Tensor('float32', input, [1, 3, modelData.height, modelData.width])
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
imageToInput(image: ImageRaw, { mean = [0, 0, 0], std = [1, 1, 1] }: ReshapeOptions): ModelData {
|
|
42
|
+
const R: number[] = []
|
|
43
|
+
const G: number[] = []
|
|
44
|
+
const B: number[] = []
|
|
45
|
+
for (let i = 0; i < image.data.length; i += 4) {
|
|
46
|
+
R.push((image.data[i] / 255 - mean[0]) / std[0])
|
|
47
|
+
G.push((image.data[i + 1] / 255 - mean[1]) / std[1])
|
|
48
|
+
B.push((image.data[i + 2] / 255 - mean[2]) / std[2])
|
|
49
|
+
}
|
|
50
|
+
const newData = [...B, ...G, ...R]
|
|
51
|
+
return {
|
|
52
|
+
data: newData,
|
|
53
|
+
width: image.width,
|
|
54
|
+
height: image.height,
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
debugImage(image: ImageRaw | any, path: string) {
|
|
59
|
+
const { debugOutputDir, isDebug } = this.options
|
|
60
|
+
if (!isDebug || !debugOutputDir) {
|
|
61
|
+
return
|
|
62
|
+
}
|
|
63
|
+
image.write(`${debugOutputDir}/${path}`)
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
async debugBoxImage(sourceImage: ImageRaw | any, lineImages: LineImage[], path: string) {
|
|
67
|
+
const { debugOutputDir, isDebug } = this.options
|
|
68
|
+
if (!isDebug || !debugOutputDir) {
|
|
69
|
+
return
|
|
70
|
+
}
|
|
71
|
+
const boxImage = await sourceImage.drawBox(lineImages)
|
|
72
|
+
boxImage.write(`${debugOutputDir}/${path}`)
|
|
73
|
+
}
|
|
74
|
+
}
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import type { InferenceSession as InferenceSessionCommon, Tensor } from 'onnxruntime-common'
|
|
2
|
+
import invariant from 'tiny-invariant'
|
|
3
|
+
import { FileUtils, InferenceSession, defaultModels } from '#common/backend'
|
|
4
|
+
import type { Dictionary, Line, LineImage, ModelBaseConstructorArg, ModelCreateOptions } from '#common/types'
|
|
5
|
+
import { ModelBase } from './ModelBase'
|
|
6
|
+
|
|
7
|
+
// Text-recognition stage: runs the recognition ONNX model over each detected
// line image and decodes the output into text plus a mean confidence.
export class Recognition extends ModelBase {
  // Character dictionary; model token index i+1 maps to #dictionary[i]
  // (index 0 is the ignored/blank token — see decode()).
  #dictionary: Dictionary

  // Loads the recognition model and its character dictionary (explicit
  // paths or bundled defaults).
  static async create({ models, onnxOptions = {}, ...restOptions }: ModelCreateOptions) {
    const recognitionPath = models?.recognitionPath || defaultModels?.recognitionPath
    invariant(recognitionPath, 'recognitionPath is required')
    const dictionaryPath = models?.dictionaryPath || defaultModels?.dictionaryPath
    invariant(dictionaryPath, 'dictionaryPath is required')
    const model = await InferenceSession.create(recognitionPath, onnxOptions)
    const dictionaryText = await FileUtils.read(dictionaryPath)
    // Trailing ' ' entry lets the model emit a space character.
    const dictionary = [...dictionaryText.split('\n'), ' ']
    return new Recognition({ model, options: restOptions }, dictionary)
  }

  constructor(options: ModelBaseConstructorArg, dictionary: Dictionary) {
    super(options)
    this.#dictionary = dictionary
  }

  // Recognizes text in each line image, then merges the per-line results
  // (boxes attached, low-confidence lines dropped) via calculateBox.
  async run(lineImages: LineImage[], { onnxOptions = {} }: { onnxOptions?: InferenceSessionCommon.RunOptions } = {}) {
    const modelDatas = await Promise.all(
      // Detect text from each line image
      lineImages.map(async (lineImage, index) => {
        // Resize Image to 48px height
        // - height must <= 48
        // - height: 48 is more accurate than 40, but same as 30
        const image = await (lineImage.image as any).resize({
          height: 48,
        })
        this.debugImage(lineImage.image, `out9-line-${index}.jpg`)
        this.debugImage(image, `out9-line-${index}-resized.jpg`)

        // transform image data to model data
        const modelData = this.imageToInput(image, {
          // mean: [0.5, 0.5, 0.5],
          // std: [0.5, 0.5, 0.5],
        })
        return modelData
      }),
    )

    const allLines: Line[] = []
    // console.time('Recognition')
    for (const modelData of modelDatas) {
      // Run model for each line image
      const output = await this.runModel({ modelData, onnxOptions })
      // use Dictionary to decode output to text
      const lines = await this.decodeText(output)
      // unshift: decodeText fills results back-to-front, so prepending
      // keeps allLines aligned with lineImages order.
      allLines.unshift(...lines)
    }
    // console.timeEnd('Recognition')
    const result = calculateBox({ lines: allLines, lineImages })
    return result
  }

  // Greedy-decodes the model output tensor into one Line per batch entry.
  // Strides suggest dims are [batch, steps, classes] — presumably; confirm
  // against the model. Results are written back-to-front (ml counts down).
  decodeText(output: Tensor) {
    const data = output
    // Number of class scores per timestep.
    const predLen = data.dims[2]
    const line: Line[] = []
    let ml = data.dims[0] - 1
    for (let l = 0; l < data.data.length; l += predLen * data.dims[1]) {
      const predsIdx: number[] = []
      const predsProb: number[] = []

      // For each timestep, take the argmax class index and its score.
      for (let i = l; i < l + predLen * data.dims[1]; i += predLen) {
        const tmpArr = data.data.slice(i, i + predLen) as Float32Array
        const tmpMax = tmpArr.reduce((a, b) => Math.max(a, b), Number.NEGATIVE_INFINITY)
        const tmpIdx = tmpArr.indexOf(tmpMax)
        predsProb.push(tmpMax)
        predsIdx.push(tmpIdx)
      }
      line[ml] = decode(this.#dictionary, predsIdx, predsProb, true)
      ml--
    }
    return line
  }
}
|
|
84
|
+
|
|
85
|
+
function decode(dictionary: string[], textIndex: number[], textProb: number[], isRemoveDuplicate: boolean) {
|
|
86
|
+
const ignoredTokens = [0]
|
|
87
|
+
const charList = []
|
|
88
|
+
const confList = []
|
|
89
|
+
for (let idx = 0; idx < textIndex.length; idx++) {
|
|
90
|
+
if (textIndex[idx] in ignoredTokens) {
|
|
91
|
+
continue
|
|
92
|
+
}
|
|
93
|
+
if (isRemoveDuplicate) {
|
|
94
|
+
if (idx > 0 && textIndex[idx - 1] === textIndex[idx]) {
|
|
95
|
+
continue
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
charList.push(dictionary[textIndex[idx] - 1])
|
|
99
|
+
if (textProb) {
|
|
100
|
+
confList.push(textProb[idx])
|
|
101
|
+
} else {
|
|
102
|
+
confList.push(1)
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
let text = ''
|
|
106
|
+
let mean = 0
|
|
107
|
+
if (charList.length) {
|
|
108
|
+
text = charList.join('')
|
|
109
|
+
let sum = 0
|
|
110
|
+
confList.forEach((item) => {
|
|
111
|
+
sum += item
|
|
112
|
+
})
|
|
113
|
+
mean = sum / confList.length
|
|
114
|
+
}
|
|
115
|
+
return { text, mean }
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
function calculateBox({
|
|
119
|
+
lines,
|
|
120
|
+
lineImages,
|
|
121
|
+
}: {
|
|
122
|
+
lines: Line[]
|
|
123
|
+
lineImages: LineImage[]
|
|
124
|
+
}) {
|
|
125
|
+
let mainLine = lines
|
|
126
|
+
const box = lineImages
|
|
127
|
+
for (const i in mainLine) {
|
|
128
|
+
const b = box[mainLine.length - Number(i) - 1].box
|
|
129
|
+
for (const p of b) {
|
|
130
|
+
p[0] = p[0]
|
|
131
|
+
p[1] = p[1]
|
|
132
|
+
}
|
|
133
|
+
mainLine[i]['box'] = b
|
|
134
|
+
}
|
|
135
|
+
mainLine = mainLine.filter((x) => x.mean >= 0.5)
|
|
136
|
+
mainLine = afAfRec(mainLine)
|
|
137
|
+
return mainLine
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
function afAfRec(lines: Line[]) {
|
|
141
|
+
const outputLines: Line[] = []
|
|
142
|
+
const indexes: Map<BoxType, number> = new Map()
|
|
143
|
+
for (const index in lines) {
|
|
144
|
+
const box: any = lines[index].box
|
|
145
|
+
indexes.set(box, Number(index))
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
const groupedBoxes = groupBoxesByMidlineDifference([...indexes.keys()])
|
|
149
|
+
|
|
150
|
+
for (const boxes of groupedBoxes) {
|
|
151
|
+
const texts = []
|
|
152
|
+
let mean = 0
|
|
153
|
+
for (const box of boxes) {
|
|
154
|
+
const index = indexes.get(box)
|
|
155
|
+
if (index === undefined) {
|
|
156
|
+
continue
|
|
157
|
+
}
|
|
158
|
+
const line = lines[index]
|
|
159
|
+
texts.push(line.text)
|
|
160
|
+
mean += line.mean
|
|
161
|
+
}
|
|
162
|
+
let outputBox = undefined
|
|
163
|
+
if (boxes.at(0) && boxes.at(-1)) {
|
|
164
|
+
outputBox = [boxes.at(0)![0], boxes.at(-1)![1], boxes.at(-1)![2], boxes.at(0)![3]]
|
|
165
|
+
}
|
|
166
|
+
outputLines.push({
|
|
167
|
+
mean: mean / boxes.length,
|
|
168
|
+
text: texts.join(' '),
|
|
169
|
+
box: outputBox,
|
|
170
|
+
})
|
|
171
|
+
}
|
|
172
|
+
return outputLines
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
function calculateAverageHeight(boxes: BoxType[]): number {
|
|
176
|
+
let totalHeight = 0
|
|
177
|
+
for (const box of boxes) {
|
|
178
|
+
const [[, y1], , [, y2]] = box
|
|
179
|
+
const height = y2 - y1
|
|
180
|
+
totalHeight += height
|
|
181
|
+
}
|
|
182
|
+
return totalHeight / boxes.length
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
function groupBoxesByMidlineDifference(boxes: BoxType[]): BoxType[][] {
|
|
186
|
+
const averageHeight = calculateAverageHeight(boxes)
|
|
187
|
+
const result: BoxType[][] = []
|
|
188
|
+
for (const box of boxes) {
|
|
189
|
+
const [[, y1], , [, y2]] = box
|
|
190
|
+
const midline = (y1 + y2) / 2
|
|
191
|
+
const group = result.find((b) => {
|
|
192
|
+
const [[, groupY1], , [, groupY2]] = b[0]
|
|
193
|
+
const groupMidline = (groupY1 + groupY2) / 2
|
|
194
|
+
return Math.abs(groupMidline - midline) < averageHeight / 2
|
|
195
|
+
})
|
|
196
|
+
if (group) {
|
|
197
|
+
group.push(box)
|
|
198
|
+
} else {
|
|
199
|
+
result.push([box])
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
for (const group of result) {
|
|
204
|
+
group.sort((a, b) => {
|
|
205
|
+
const [ltA] = a
|
|
206
|
+
const [ltB] = b
|
|
207
|
+
return ltA[0] - ltB[0]
|
|
208
|
+
})
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
result.sort((a, b) => a[0][0][1] - b[0][0][1])
|
|
212
|
+
|
|
213
|
+
return result
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
type pointType = [number, number]
|
|
217
|
+
type BoxType = [pointType, pointType, pointType, pointType]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './types'
|