doc-page-extractor 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

@@ -0,0 +1,321 @@
1
+ import cv2
2
+ import numpy as np
3
+ import math
4
+ from PIL import Image
5
+
6
+
7
+ from .rec_postprocess import CTCLabelDecode
8
+ from .predict_base import PredictBase
9
+
10
+
11
class TextRecognizer(PredictBase):
    """Recognize text in cropped text-line images with an ONNX model.

    The class bundles the per-algorithm preprocessing helpers
    (``resize_norm_img*`` / ``norm_img_can``) and a batched ``__call__`` that
    feeds preprocessed crops to the ONNX session and decodes the output with
    ``CTCLabelDecode``.
    """

    def __init__(self, args):
        """Store the recognition settings and create the ONNX session.

        Args:
            args: Configuration object. The attributes read here are
                ``rec_image_shape``, ``rec_batch_num``, ``rec_algorithm``,
                ``rec_char_dict_path``, ``use_space_char``, ``rec_model_dir``
                and ``use_gpu``.
        """
        self.rec_image_shape = args.rec_image_shape
        self.rec_batch_num = args.rec_batch_num
        self.rec_algorithm = args.rec_algorithm
        self.postprocess_op = CTCLabelDecode(
            character_dict_path=args.rec_char_dict_path,
            use_space_char=args.use_space_char,
        )

        # Initialise the model. The session helpers are not defined in this
        # class; presumably they are inherited from PredictBase.
        self.rec_onnx_session = self.get_onnx_session(args.rec_model_dir, args.use_gpu)
        self.rec_input_name = self.get_input_name(self.rec_onnx_session)
        self.rec_output_name = self.get_output_name(self.rec_onnx_session)

    def resize_norm_img(self, img, max_wh_ratio):
        """Resize and normalise one crop according to ``self.rec_algorithm``.

        * ``"NRTR"`` / ``"ViTSTR"``: convert to grayscale, resize with PIL to
          ``(imgW, imgH)`` and scale (``/ 255`` for ViTSTR,
          ``/ 128 - 1`` for NRTR); the result has a leading channel axis.
        * ``"RFL"``: convert to grayscale, cubic resize to ``(imgW, imgH)``,
          then apply ``(x / 255 - 0.5) / 0.5`` with a leading channel axis.
        * anything else: aspect-preserving resize to height ``imgH``, apply
          ``(x / 255 - 0.5) / 0.5`` and right-pad with zeros to a width of
          ``int(imgH * max_wh_ratio)`` (or ``rec_image_shape[2]`` for
          ``"RARE"``).

        Args:
            img: Image array; it is passed to ``cv2.cvtColor(...BGR2GRAY)`` or
                indexed as ``img.shape[2]`` depending on the algorithm.
            max_wh_ratio: Width/height ratio that fixes the padded width in
                the default branch.

        Returns:
            A ``float32`` array with the channel axis first.

        Raises:
            AssertionError: In the default branch, if ``img.shape[2]`` differs
                from the configured channel count.
        """
        imgC, imgH, imgW = self.rec_image_shape
        if self.rec_algorithm in ("NRTR", "ViTSTR"):
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            image_pil = Image.fromarray(np.uint8(img))
            if self.rec_algorithm == "ViTSTR":
                img = image_pil.resize([imgW, imgH], Image.BICUBIC)
            else:
                # Image.ANTIALIAS was removed in Pillow 10.0.0; LANCZOS is the
                # filter it used to alias, so results are unchanged.
                img = image_pil.resize([imgW, imgH], Image.LANCZOS)
            img = np.array(img)
            norm_img = np.expand_dims(img, -1)
            norm_img = norm_img.transpose((2, 0, 1))
            if self.rec_algorithm == "ViTSTR":
                norm_img = norm_img.astype(np.float32) / 255.0
            else:
                norm_img = norm_img.astype(np.float32) / 128.0 - 1.0
            return norm_img
        elif self.rec_algorithm == "RFL":
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_CUBIC)
            resized_image = resized_image.astype("float32")
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
            resized_image -= 0.5
            resized_image /= 0.5
            return resized_image

        assert imgC == img.shape[2]
        # The target width follows the widest crop of the batch rather than
        # the configured imgW.
        imgW = int(imgH * max_wh_ratio)

        h, w = img.shape[:2]
        ratio = w / float(h)
        # Keep the aspect ratio but never exceed the padded width.
        resized_w = min(imgW, int(math.ceil(imgH * ratio)))
        if self.rec_algorithm == "RARE":
            # RARE uses the configured width as a hard upper bound.
            if resized_w > self.rec_image_shape[2]:
                resized_w = self.rec_image_shape[2]
            imgW = self.rec_image_shape[2]
        resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype("float32")
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

    def resize_norm_img_vl(self, img, image_shape):
        """Reverse the channel order, resize to ``(imgW, imgH)`` and scale by 1/255.

        Args:
            img: Image array indexed as ``img[:, :, ::-1]`` (the original
                comment describes this as BGR -> RGB).
            image_shape: ``(imgC, imgH, imgW)``; ``imgC`` is not used.

        Returns:
            Channel-first ``float32`` array.
        """
        _, imgH, imgW = image_shape
        img = img[:, :, ::-1]  # bgr2rgb
        resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_image = resized_image.astype("float32")
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        return resized_image

    def resize_norm_img_srn(self, img, image_shape):
        """Resize a crop to a bucketed width and paste it on a zero canvas.

        The crop is resized to height ``imgH`` and to a width of ``imgH``,
        ``2 * imgH``, ``3 * imgH`` or ``imgW`` depending on its aspect ratio,
        converted to grayscale and written into the left part of an
        ``(imgH, imgW)`` zero array.

        Args:
            img: Image array accepted by ``cv2.resize`` and
                ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
            image_shape: ``(imgC, imgH, imgW)``; ``imgC`` is not used.

        Returns:
            ``float32`` array of shape ``(1, imgH, imgW)``.
        """
        _, imgH, imgW = image_shape

        img_black = np.zeros((imgH, imgW))
        im_hei = img.shape[0]
        im_wid = img.shape[1]

        if im_wid <= im_hei * 1:
            img_new = cv2.resize(img, (imgH * 1, imgH))
        elif im_wid <= im_hei * 2:
            img_new = cv2.resize(img, (imgH * 2, imgH))
        elif im_wid <= im_hei * 3:
            img_new = cv2.resize(img, (imgH * 3, imgH))
        else:
            img_new = cv2.resize(img, (imgW, imgH))

        img_np = np.asarray(img_new)
        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
        img_black[:, 0 : img_np.shape[1]] = img_np
        img_black = img_black[:, :, np.newaxis]

        row, col, _ = img_black.shape
        # The canvas has a single channel, so this reshape only moves the
        # length-1 axis to the front.
        return np.reshape(img_black, (1, row, col)).astype(np.float32)

    def srn_other_inputs(self, image_shape, num_heads, max_text_length):
        """Build the position indices and attention-bias arrays used with SRN.

        Args:
            image_shape: ``(imgC, imgH, imgW)``; only ``imgH`` and ``imgW``
                are used, to compute ``int((imgH / 8) * (imgW / 8))``
                encoder positions.
            num_heads: Number of times each bias array is tiled along axis 1.
            max_text_length: Side length of the square bias matrices and the
                number of ``gsrm_word_pos`` entries.

        Returns:
            ``[encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2]``. The position arrays are ``int64`` with a
            leading axis of length 1; the bias arrays hold ``-1e9`` strictly
            above (``bias1``) or strictly below (``bias2``) the diagonal and
            zero elsewhere.
        """
        _, imgH, imgW = image_shape
        feature_dim = int((imgH / 8) * (imgW / 8))

        encoder_word_pos = (
            np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype("int64")
        )
        gsrm_word_pos = (
            np.array(range(0, max_text_length))
            .reshape((max_text_length, 1))
            .astype("int64")
        )

        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
            [-1, 1, max_text_length, max_text_length]
        )
        gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, [1, num_heads, 1, 1]).astype(
            "float32"
        ) * [-1e9]

        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
            [-1, 1, max_text_length, max_text_length]
        )
        gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, [1, num_heads, 1, 1]).astype(
            "float32"
        ) * [-1e9]

        encoder_word_pos = encoder_word_pos[np.newaxis, :]
        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]

        return [
            encoder_word_pos,
            gsrm_word_pos,
            gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2,
        ]

    def process_image_srn(self, img, image_shape, num_heads, max_text_length):
        """Prepare the image plus the auxiliary SRN inputs for one crop.

        Combines ``resize_norm_img_srn`` (with a leading batch axis added)
        and ``srn_other_inputs``, casting the bias arrays to ``float32`` and
        the position arrays to ``int64``.

        Returns:
            Tuple ``(norm_img, encoder_word_pos, gsrm_word_pos,
            gsrm_slf_attn_bias1, gsrm_slf_attn_bias2)``.
        """
        norm_img = self.resize_norm_img_srn(img, image_shape)
        norm_img = norm_img[np.newaxis, :]

        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = (
            self.srn_other_inputs(image_shape, num_heads, max_text_length)
        )

        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
        encoder_word_pos = encoder_word_pos.astype(np.int64)
        gsrm_word_pos = gsrm_word_pos.astype(np.int64)

        return (
            norm_img,
            encoder_word_pos,
            gsrm_word_pos,
            gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2,
        )

    def resize_norm_img_sar(self, img, image_shape, width_downsample_ratio=0.25):
        """Resize to height ``imgH``, normalise and right-pad with ``-1``.

        Args:
            img: Image array accepted by ``cv2.resize``.
            image_shape: ``(imgC, imgH, imgW_min, imgW_max)``. ``imgW_min``
                may be ``None``. ``imgW_max`` is also checked for ``None``,
                but it is used unconditionally as the padded width, so it
                effectively has to be an integer.
            width_downsample_ratio: The resized width is rounded to a
                multiple of ``int(1 / width_downsample_ratio)``.

        Returns:
            Tuple ``(padding_im, resize_shape, pad_shape, valid_ratio)`` where
            ``padding_im`` is the ``float32`` padded image, the two shapes
            are those of the resized and padded arrays, and ``valid_ratio``
            is ``min(1.0, resize_w / imgW_max)``.
        """
        imgC, imgH, imgW_min, imgW_max = image_shape
        h = img.shape[0]
        w = img.shape[1]
        valid_ratio = 1.0
        # Make sure the new width is an integral multiple of width_divisor.
        width_divisor = int(1 / width_downsample_ratio)
        # Resize while keeping the aspect ratio.
        ratio = w / float(h)
        resize_w = math.ceil(imgH * ratio)
        if resize_w % width_divisor != 0:
            resize_w = round(resize_w / width_divisor) * width_divisor
        if imgW_min is not None:
            resize_w = max(imgW_min, resize_w)
        if imgW_max is not None:
            valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
            resize_w = min(imgW_max, resize_w)
        resized_image = cv2.resize(img, (resize_w, imgH))
        resized_image = resized_image.astype("float32")
        # Normalise with (x / 255 - 0.5) / 0.5.
        if image_shape[0] == 1:
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
        else:
            resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        resize_shape = resized_image.shape
        padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
        padding_im[:, :, 0:resize_w] = resized_image
        pad_shape = padding_im.shape

        return padding_im, resize_shape, pad_shape, valid_ratio

    def resize_norm_img_spin(self, img):
        """Convert to grayscale, resize to 100x32 and apply ``(x - 127.5) / 127.5``.

        Args:
            img: Image array accepted by
                ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

        Returns:
            ``float32`` array of shape ``(1, 32, 100)``.
        """
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # NOTE(review): the third positional parameter of cv2.resize is
        # ``dst``, not ``interpolation``, so INTER_CUBIC is probably not
        # applied here. Left unchanged because switching the interpolation
        # could change the model input - confirm what is intended.
        img = cv2.resize(img, (100, 32), cv2.INTER_CUBIC)
        img = np.array(img, np.float32)
        img = np.expand_dims(img, -1)
        img = img.transpose((2, 0, 1))
        mean = np.array([127.5], dtype=np.float32)
        std = np.array([127.5], dtype=np.float32)
        mean = np.float32(mean.reshape(1, -1))
        stdinv = 1 / np.float32(std.reshape(1, -1))
        img -= mean
        img *= stdinv
        return img

    def resize_norm_img_svtr(self, img, image_shape):
        """Resize to ``(imgW, imgH)`` and apply ``(x / 255 - 0.5) / 0.5``.

        Args:
            img: Image array accepted by ``cv2.resize``; it is transposed
                with ``(2, 0, 1)``, so three axes are expected.
            image_shape: ``(imgC, imgH, imgW)``; ``imgC`` is not used.

        Returns:
            Channel-first ``float32`` array.
        """
        _, imgH, imgW = image_shape
        resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_image = resized_image.astype("float32")
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        return resized_image

    def resize_norm_img_abinet(self, img, image_shape):
        """Resize to ``(imgW, imgH)`` and standardise each channel.

        After scaling by 1/255 the image is normalised with the per-channel
        mean ``(0.485, 0.456, 0.406)`` and std ``(0.229, 0.224, 0.225)``.

        Args:
            img: Image array accepted by ``cv2.resize`` whose last axis
                broadcasts against the three-element mean/std.
            image_shape: ``(imgC, imgH, imgW)``; ``imgC`` is not used.

        Returns:
            Channel-first ``float32`` array.
        """
        _, imgH, imgW = image_shape

        resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_image = resized_image.astype("float32")
        resized_image = resized_image / 255.0

        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        resized_image = (resized_image - mean[None, None, ...]) / std[None, None, ...]
        resized_image = resized_image.transpose((2, 0, 1))
        resized_image = resized_image.astype("float32")

        return resized_image

    def norm_img_can(self, img, image_shape):
        """Convert to grayscale, optionally invert, pad and scale to ``[0, 1]``.

        Args:
            img: Image array accepted by
                ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
            image_shape: Unused; the padding target comes from
                ``self.rec_image_shape``. Kept for interface compatibility.

        Returns:
            ``float32`` array with a leading axis of length 1.
        """
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # CAN only predicts grayscale images

        # ``inverse`` is not assigned in this class's __init__; it may be set
        # by PredictBase or by the caller. Default to "no inversion" instead
        # of raising AttributeError when it is absent.
        if getattr(self, "inverse", False):
            img = 255 - img

        if self.rec_image_shape[0] == 1:
            h, w = img.shape
            _, imgH, imgW = self.rec_image_shape
            if h < imgH or w < imgW:
                # Pad bottom/right with white (255) up to the configured size.
                padding_h = max(imgH - h, 0)
                padding_w = max(imgW - w, 0)
                img = np.pad(
                    img,
                    ((0, padding_h), (0, padding_w)),
                    "constant",
                    constant_values=(255),
                )

        img = np.expand_dims(img, 0) / 255.0  # (h, w) -> (1, h, w)
        img = img.astype("float32")

        return img

    def __call__(self, img_list):
        """Recognise every crop in ``img_list``.

        Crops are processed in batches of ``self.rec_batch_num`` in order of
        increasing aspect ratio; each batch is preprocessed with
        ``resize_norm_img``, run through the ONNX session and decoded with
        ``self.postprocess_op``.

        Args:
            img_list: Sequence of image arrays (``shape[0]`` is read as the
                height and ``shape[1]`` as the width).

        Returns:
            A list with one entry per input image, in input order. Entries
            are whatever ``self.postprocess_op`` yields (the caller unpacks
            them as ``(text, score)``); slots the decoder does not fill keep
            the placeholder ``["", 0.0]``.
        """
        img_num = len(img_list)
        # Aspect ratio (width / height) of every text crop.
        width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]
        # Sorting by aspect ratio can speed up the recognition process.
        indices = np.argsort(np.array(width_list))
        # One independent placeholder per image (``[[...]] * n`` would alias
        # a single inner list across all slots).
        rec_res = [["", 0.0] for _ in range(img_num)]
        batch_num = self.rec_batch_num
        _, imgH, imgW = self.rec_image_shape[:3]

        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)
            batch_indices = indices[beg_img_no:end_img_no]

            # The widest crop of the batch (but at least the configured
            # ratio) decides the padded width for the whole batch.
            max_wh_ratio = imgW / imgH
            for idx in batch_indices:
                h, w = img_list[idx].shape[0:2]
                max_wh_ratio = max(max_wh_ratio, w * 1.0 / h)

            norm_img_batch = [
                self.resize_norm_img(img_list[idx], max_wh_ratio)[np.newaxis, :]
                for idx in batch_indices
            ]
            norm_img_batch = np.concatenate(norm_img_batch)
            norm_img_batch = norm_img_batch.copy()

            input_feed = self.get_input_feed(self.rec_input_name, norm_img_batch)
            outputs = self.rec_onnx_session.run(
                self.rec_output_name, input_feed=input_feed
            )

            preds = outputs[0]

            rec_result = self.postprocess_op(preds)
            # Scatter the results back to the original image order.
            for rno, result in enumerate(rec_result):
                rec_res[indices[beg_img_no + rno]] = result

        return rec_res
@@ -0,0 +1,97 @@
1
+ import os
2
+ import cv2
3
+ import copy
4
+
5
+ from . import predict_det
6
+ from . import predict_cls
7
+ from . import predict_rec
8
+ from .utils import get_rotate_crop_image, get_minarea_rect_crop
9
+
10
class TextSystem:
    """Chain text detection, optional angle classification and recognition."""

    def __init__(self, args):
        """Create the detector, recognizer and angle classifier from ``args``.

        ``args`` is kept on the instance; ``drop_score`` is read here, the
        other attributes (``det_box_type``, ``save_crop_res``,
        ``crop_res_save_dir``) are read when the instance is called.
        """
        self.text_detector = predict_det.TextDetector(args)
        self.text_recognizer = predict_rec.TextRecognizer(args)
        # Angle classification is always enabled in this class.
        self.use_angle_cls = True
        self.drop_score = args.drop_score
        if self.use_angle_cls:
            self.text_classifier = predict_cls.TextClassifier(args)

        self.args = args
        # Running counter used to number the crop images written to disk.
        self.crop_image_res_index = 0

    def draw_crop_rec_res(self, output_dir, img_crop_list, rec_res):
        """Write every crop in ``img_crop_list`` to ``output_dir`` as a JPEG.

        Files are numbered continuously across calls through
        ``self.crop_image_res_index``. ``rec_res`` is accepted but not used.
        """
        os.makedirs(output_dir, exist_ok=True)
        crop_count = len(img_crop_list)
        for offset in range(crop_count):
            file_name = f"mg_crop_{offset + self.crop_image_res_index}.jpg"
            cv2.imwrite(os.path.join(output_dir, file_name), img_crop_list[offset])

        self.crop_image_res_index += crop_count

    def __call__(self, img, cls=True):
        """Detect, crop, optionally classify and recognise text in ``img``.

        Args:
            img: Input image; it is copied with ``img.copy()`` before
                detection and the copy is used for cropping.
            cls: When true (and ``self.use_angle_cls`` is set) the crops are
                passed through the angle classifier before recognition.

        Returns:
            ``(None, None)`` when the detector returns ``None``; otherwise
            ``(boxes, results)`` restricted to the results whose score is at
            least ``self.drop_score``.
        """
        ori_im = img.copy()

        # Text detection.
        dt_boxes = self.text_detector(img)
        if dt_boxes is None:
            return None, None

        dt_boxes = sorted_boxes(dt_boxes)

        # Crop every detected box out of the untouched copy.
        img_crop_list = []
        for detected in dt_boxes:
            box_copy = copy.deepcopy(detected)
            if self.args.det_box_type == "quad":
                crop = get_rotate_crop_image(ori_im, box_copy)
            else:
                crop = get_minarea_rect_crop(ori_im, box_copy)
            img_crop_list.append(crop)

        # Orientation classification.
        if self.use_angle_cls and cls:
            img_crop_list, _angles = self.text_classifier(img_crop_list)

        # Text recognition.
        rec_res = self.text_recognizer(img_crop_list)

        if self.args.save_crop_res:
            self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, rec_res)

        # Keep only the results that reach the score threshold.
        kept_boxes = []
        kept_results = []
        for box, rec_result in zip(dt_boxes, rec_res):
            _text, score = rec_result
            if score >= self.drop_score:
                kept_boxes.append(box)
                kept_results.append(rec_result)

        return kept_boxes, kept_results
73
+
74
+
75
def sorted_boxes(dt_boxes):
    """
    Sort text boxes in order from top to bottom, left to right.

    Boxes are first ordered by the ``[0][1]`` then ``[0][0]`` coordinate of
    their first point. A second pass then bubbles a box towards the front
    while its predecessor's ``[0][1]`` differs by less than 10 and the box's
    ``[0][0]`` is smaller, so boxes on roughly the same row end up ordered
    left to right.

    args:
        dt_boxes(array): detected text boxes; ``dt_boxes.shape[0]`` is the
            number of boxes and ``box[0]`` is the point used for ordering
    return:
        list of the boxes in reading order
    """
    box_count = dt_boxes.shape[0]
    ordered = sorted(dt_boxes, key=lambda box: (box[0][1], box[0][0]))

    for start in range(box_count - 1):
        pos = start
        while pos >= 0:
            prev_box, next_box = ordered[pos], ordered[pos + 1]
            same_row = abs(next_box[0][1] - prev_box[0][1]) < 10
            if same_row and next_box[0][0] < prev_box[0][0]:
                ordered[pos], ordered[pos + 1] = next_box, prev_box
            else:
                break
            pos -= 1
    return ordered