doc-page-extractor 0.1.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. doc_page_extractor/__init__.py +5 -14
  2. doc_page_extractor/check_env.py +40 -0
  3. doc_page_extractor/extractor.py +87 -212
  4. doc_page_extractor/model.py +97 -0
  5. doc_page_extractor/parser.py +51 -0
  6. doc_page_extractor/plot.py +52 -79
  7. doc_page_extractor/redacter.py +111 -0
  8. doc_page_extractor-1.0.2.dist-info/METADATA +120 -0
  9. doc_page_extractor-1.0.2.dist-info/RECORD +11 -0
  10. {doc_page_extractor-0.1.1.dist-info → doc_page_extractor-1.0.2.dist-info}/WHEEL +1 -2
  11. doc_page_extractor-1.0.2.dist-info/licenses/LICENSE +21 -0
  12. doc_page_extractor/clipper.py +0 -119
  13. doc_page_extractor/downloader.py +0 -16
  14. doc_page_extractor/latex.py +0 -57
  15. doc_page_extractor/layout_order.py +0 -240
  16. doc_page_extractor/layoutreader.py +0 -126
  17. doc_page_extractor/ocr.py +0 -175
  18. doc_page_extractor/ocr_corrector.py +0 -126
  19. doc_page_extractor/onnxocr/__init__.py +0 -1
  20. doc_page_extractor/onnxocr/cls_postprocess.py +0 -26
  21. doc_page_extractor/onnxocr/db_postprocess.py +0 -246
  22. doc_page_extractor/onnxocr/imaug.py +0 -32
  23. doc_page_extractor/onnxocr/operators.py +0 -187
  24. doc_page_extractor/onnxocr/predict_base.py +0 -52
  25. doc_page_extractor/onnxocr/predict_cls.py +0 -89
  26. doc_page_extractor/onnxocr/predict_det.py +0 -120
  27. doc_page_extractor/onnxocr/predict_rec.py +0 -321
  28. doc_page_extractor/onnxocr/predict_system.py +0 -97
  29. doc_page_extractor/onnxocr/rec_postprocess.py +0 -896
  30. doc_page_extractor/onnxocr/utils.py +0 -71
  31. doc_page_extractor/overlap.py +0 -167
  32. doc_page_extractor/raw_optimizer.py +0 -104
  33. doc_page_extractor/rectangle.py +0 -72
  34. doc_page_extractor/rotation.py +0 -158
  35. doc_page_extractor/struct_eqtable/__init__.py +0 -49
  36. doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -2
  37. doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -394
  38. doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -198
  39. doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -81
  40. doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -3
  41. doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -76
  42. doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -1047
  43. doc_page_extractor/table.py +0 -71
  44. doc_page_extractor/types.py +0 -67
  45. doc_page_extractor/utils.py +0 -32
  46. doc_page_extractor-0.1.1.dist-info/METADATA +0 -84
  47. doc_page_extractor-0.1.1.dist-info/RECORD +0 -44
  48. doc_page_extractor-0.1.1.dist-info/licenses/LICENSE +0 -661
  49. doc_page_extractor-0.1.1.dist-info/top_level.txt +0 -2
  50. tests/__init__.py +0 -0
  51. tests/test_history_bus.py +0 -55
doc_page_extractor/onnxocr/predict_det.py +0 -120
@@ -1,120 +0,0 @@
1
- import numpy as np
2
- from .imaug import transform, create_operators
3
- from .db_postprocess import DBPostProcess
4
- from .predict_base import PredictBase
5
-
6
-
7
- class TextDetector(PredictBase):
8
- def __init__(self, args):
9
- self.args = args
10
- self.det_algorithm = args.det_algorithm
11
- pre_process_list = [
12
- {
13
- "DetResizeForTest": {
14
- "limit_side_len": args.det_limit_side_len,
15
- "limit_type": args.det_limit_type,
16
- }
17
- },
18
- {
19
- "NormalizeImage": {
20
- "std": [0.229, 0.224, 0.225],
21
- "mean": [0.485, 0.456, 0.406],
22
- "scale": "1./255.",
23
- "order": "hwc",
24
- }
25
- },
26
- {"ToCHWImage": None},
27
- {"KeepKeys": {"keep_keys": ["image", "shape"]}},
28
- ]
29
- postprocess_params = {}
30
- postprocess_params["name"] = "DBPostProcess"
31
- postprocess_params["thresh"] = args.det_db_thresh
32
- postprocess_params["box_thresh"] = args.det_db_box_thresh
33
- postprocess_params["max_candidates"] = 1000
34
- postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
35
- postprocess_params["use_dilation"] = args.use_dilation
36
- postprocess_params["score_mode"] = args.det_db_score_mode
37
- postprocess_params["box_type"] = args.det_box_type
38
-
39
- # 实例化预处理操作类
40
- self.preprocess_op = create_operators(pre_process_list)
41
- # self.postprocess_op = build_post_process(postprocess_params)
42
- # 实例化后处理操作类
43
- self.postprocess_op = DBPostProcess(**postprocess_params)
44
-
45
- # 初始化模型
46
- self.det_onnx_session = self.get_onnx_session(args.det_model_dir, args.use_gpu)
47
- self.det_input_name = self.get_input_name(self.det_onnx_session)
48
- self.det_output_name = self.get_output_name(self.det_onnx_session)
49
-
50
- def order_points_clockwise(self, pts):
51
- rect = np.zeros((4, 2), dtype="float32")
52
- s = pts.sum(axis=1)
53
- rect[0] = pts[np.argmin(s)]
54
- rect[2] = pts[np.argmax(s)]
55
- tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
56
- diff = np.diff(np.array(tmp), axis=1)
57
- rect[1] = tmp[np.argmin(diff)]
58
- rect[3] = tmp[np.argmax(diff)]
59
- return rect
60
-
61
- def clip_det_res(self, points, img_height, img_width):
62
- for pno in range(points.shape[0]):
63
- points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
64
- points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
65
- return points
66
-
67
- def filter_tag_det_res(self, dt_boxes, image_shape):
68
- img_height, img_width = image_shape[0:2]
69
- dt_boxes_new = []
70
- for box in dt_boxes:
71
- if type(box) is list:
72
- box = np.array(box)
73
- box = self.order_points_clockwise(box)
74
- box = self.clip_det_res(box, img_height, img_width)
75
- rect_width = int(np.linalg.norm(box[0] - box[1]))
76
- rect_height = int(np.linalg.norm(box[0] - box[3]))
77
- if rect_width <= 3 or rect_height <= 3:
78
- continue
79
- dt_boxes_new.append(box)
80
- dt_boxes = np.array(dt_boxes_new)
81
- return dt_boxes
82
-
83
- def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
84
- img_height, img_width = image_shape[0:2]
85
- dt_boxes_new = []
86
- for box in dt_boxes:
87
- if type(box) is list:
88
- box = np.array(box)
89
- box = self.clip_det_res(box, img_height, img_width)
90
- dt_boxes_new.append(box)
91
- dt_boxes = np.array(dt_boxes_new)
92
- return dt_boxes
93
-
94
- def __call__(self, img):
95
- ori_im = img.copy()
96
- data = {"image": img}
97
-
98
- data = transform(data, self.preprocess_op)
99
- img, shape_list = data
100
- if img is None:
101
- return None, 0
102
- img = np.expand_dims(img, axis=0)
103
- shape_list = np.expand_dims(shape_list, axis=0)
104
- img = img.copy()
105
-
106
- input_feed = self.get_input_feed(self.det_input_name, img)
107
- outputs = self.det_onnx_session.run(self.det_output_name, input_feed=input_feed)
108
-
109
- preds = {}
110
- preds["maps"] = outputs[0]
111
-
112
- post_result = self.postprocess_op(preds, shape_list)
113
- dt_boxes = post_result[0]["points"]
114
-
115
- if self.args.det_box_type == "poly":
116
- dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
117
- else:
118
- dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
119
-
120
- return dt_boxes
doc_page_extractor/onnxocr/predict_rec.py +0 -321
@@ -1,321 +0,0 @@
1
- import cv2
2
- import numpy as np
3
- import math
4
- from PIL import Image
5
-
6
-
7
- from .rec_postprocess import CTCLabelDecode
8
- from .predict_base import PredictBase
9
-
10
-
11
- class TextRecognizer(PredictBase):
12
- def __init__(self, args):
13
- self.rec_image_shape = args.rec_image_shape
14
- self.rec_batch_num = args.rec_batch_num
15
- self.rec_algorithm = args.rec_algorithm
16
- self.postprocess_op = CTCLabelDecode(
17
- character_dict_path=args.rec_char_dict_path,
18
- use_space_char=args.use_space_char,
19
- )
20
-
21
- # 初始化模型
22
- self.rec_onnx_session = self.get_onnx_session(args.rec_model_dir, args.use_gpu)
23
- self.rec_input_name = self.get_input_name(self.rec_onnx_session)
24
- self.rec_output_name = self.get_output_name(self.rec_onnx_session)
25
-
26
- def resize_norm_img(self, img, max_wh_ratio):
27
- imgC, imgH, imgW = self.rec_image_shape
28
- if self.rec_algorithm == "NRTR" or self.rec_algorithm == "ViTSTR":
29
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
30
- # return padding_im
31
- image_pil = Image.fromarray(np.uint8(img))
32
- if self.rec_algorithm == "ViTSTR":
33
- img = image_pil.resize([imgW, imgH], Image.BICUBIC)
34
- else:
35
- img = image_pil.resize([imgW, imgH], Image.ANTIALIAS)
36
- img = np.array(img)
37
- norm_img = np.expand_dims(img, -1)
38
- norm_img = norm_img.transpose((2, 0, 1))
39
- if self.rec_algorithm == "ViTSTR":
40
- norm_img = norm_img.astype(np.float32) / 255.0
41
- else:
42
- norm_img = norm_img.astype(np.float32) / 128.0 - 1.0
43
- return norm_img
44
- elif self.rec_algorithm == "RFL":
45
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
46
- resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_CUBIC)
47
- resized_image = resized_image.astype("float32")
48
- resized_image = resized_image / 255
49
- resized_image = resized_image[np.newaxis, :]
50
- resized_image -= 0.5
51
- resized_image /= 0.5
52
- return resized_image
53
-
54
- assert imgC == img.shape[2]
55
- imgW = int((imgH * max_wh_ratio))
56
-
57
- # w = self.rec_onnx_session.get_inputs()[0].shape[3:][0]
58
- # w = self.rec_onnx_session.get_inputs()[0].shape[3:][0]
59
- # print(w)
60
- # if w is not None and w > 0:
61
- # imgW = w
62
-
63
- h, w = img.shape[:2]
64
- ratio = w / float(h)
65
- if math.ceil(imgH * ratio) > imgW:
66
- resized_w = imgW
67
- else:
68
- resized_w = int(math.ceil(imgH * ratio))
69
- if self.rec_algorithm == "RARE":
70
- if resized_w > self.rec_image_shape[2]:
71
- resized_w = self.rec_image_shape[2]
72
- imgW = self.rec_image_shape[2]
73
- resized_image = cv2.resize(img, (resized_w, imgH))
74
- resized_image = resized_image.astype("float32")
75
- resized_image = resized_image.transpose((2, 0, 1)) / 255
76
- resized_image -= 0.5
77
- resized_image /= 0.5
78
- padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
79
- padding_im[:, :, 0:resized_w] = resized_image
80
- return padding_im
81
-
82
- def resize_norm_img_vl(self, img, image_shape):
83
- imgC, imgH, imgW = image_shape
84
- img = img[:, :, ::-1] # bgr2rgb
85
- resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
86
- resized_image = resized_image.astype("float32")
87
- resized_image = resized_image.transpose((2, 0, 1)) / 255
88
- return resized_image
89
-
90
- def resize_norm_img_srn(self, img, image_shape):
91
- imgC, imgH, imgW = image_shape
92
-
93
- img_black = np.zeros((imgH, imgW))
94
- im_hei = img.shape[0]
95
- im_wid = img.shape[1]
96
-
97
- if im_wid <= im_hei * 1:
98
- img_new = cv2.resize(img, (imgH * 1, imgH))
99
- elif im_wid <= im_hei * 2:
100
- img_new = cv2.resize(img, (imgH * 2, imgH))
101
- elif im_wid <= im_hei * 3:
102
- img_new = cv2.resize(img, (imgH * 3, imgH))
103
- else:
104
- img_new = cv2.resize(img, (imgW, imgH))
105
-
106
- img_np = np.asarray(img_new)
107
- img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
108
- img_black[:, 0 : img_np.shape[1]] = img_np
109
- img_black = img_black[:, :, np.newaxis]
110
-
111
- row, col, c = img_black.shape
112
- c = 1
113
-
114
- return np.reshape(img_black, (c, row, col)).astype(np.float32)
115
-
116
- def srn_other_inputs(self, image_shape, num_heads, max_text_length):
117
- imgC, imgH, imgW = image_shape
118
- feature_dim = int((imgH / 8) * (imgW / 8))
119
-
120
- encoder_word_pos = (
121
- np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype("int64")
122
- )
123
- gsrm_word_pos = (
124
- np.array(range(0, max_text_length))
125
- .reshape((max_text_length, 1))
126
- .astype("int64")
127
- )
128
-
129
- gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
130
- gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
131
- [-1, 1, max_text_length, max_text_length]
132
- )
133
- gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, [1, num_heads, 1, 1]).astype(
134
- "float32"
135
- ) * [-1e9]
136
-
137
- gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
138
- [-1, 1, max_text_length, max_text_length]
139
- )
140
- gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, [1, num_heads, 1, 1]).astype(
141
- "float32"
142
- ) * [-1e9]
143
-
144
- encoder_word_pos = encoder_word_pos[np.newaxis, :]
145
- gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
146
-
147
- return [
148
- encoder_word_pos,
149
- gsrm_word_pos,
150
- gsrm_slf_attn_bias1,
151
- gsrm_slf_attn_bias2,
152
- ]
153
-
154
- def process_image_srn(self, img, image_shape, num_heads, max_text_length):
155
- norm_img = self.resize_norm_img_srn(img, image_shape)
156
- norm_img = norm_img[np.newaxis, :]
157
-
158
- [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = (
159
- self.srn_other_inputs(image_shape, num_heads, max_text_length)
160
- )
161
-
162
- gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
163
- gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
164
- encoder_word_pos = encoder_word_pos.astype(np.int64)
165
- gsrm_word_pos = gsrm_word_pos.astype(np.int64)
166
-
167
- return (
168
- norm_img,
169
- encoder_word_pos,
170
- gsrm_word_pos,
171
- gsrm_slf_attn_bias1,
172
- gsrm_slf_attn_bias2,
173
- )
174
-
175
- def resize_norm_img_sar(self, img, image_shape, width_downsample_ratio=0.25):
176
- imgC, imgH, imgW_min, imgW_max = image_shape
177
- h = img.shape[0]
178
- w = img.shape[1]
179
- valid_ratio = 1.0
180
- # make sure new_width is an integral multiple of width_divisor.
181
- width_divisor = int(1 / width_downsample_ratio)
182
- # resize
183
- ratio = w / float(h)
184
- resize_w = math.ceil(imgH * ratio)
185
- if resize_w % width_divisor != 0:
186
- resize_w = round(resize_w / width_divisor) * width_divisor
187
- if imgW_min is not None:
188
- resize_w = max(imgW_min, resize_w)
189
- if imgW_max is not None:
190
- valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
191
- resize_w = min(imgW_max, resize_w)
192
- resized_image = cv2.resize(img, (resize_w, imgH))
193
- resized_image = resized_image.astype("float32")
194
- # norm
195
- if image_shape[0] == 1:
196
- resized_image = resized_image / 255
197
- resized_image = resized_image[np.newaxis, :]
198
- else:
199
- resized_image = resized_image.transpose((2, 0, 1)) / 255
200
- resized_image -= 0.5
201
- resized_image /= 0.5
202
- resize_shape = resized_image.shape
203
- padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
204
- padding_im[:, :, 0:resize_w] = resized_image
205
- pad_shape = padding_im.shape
206
-
207
- return padding_im, resize_shape, pad_shape, valid_ratio
208
-
209
- def resize_norm_img_spin(self, img):
210
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
211
- # return padding_im
212
- img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC)
213
- img = np.array(img, np.float32)
214
- img = np.expand_dims(img, -1)
215
- img = img.transpose((2, 0, 1))
216
- mean = [127.5]
217
- std = [127.5]
218
- mean = np.array(mean, dtype=np.float32)
219
- std = np.array(std, dtype=np.float32)
220
- mean = np.float32(mean.reshape(1, -1))
221
- stdinv = 1 / np.float32(std.reshape(1, -1))
222
- img -= mean
223
- img *= stdinv
224
- return img
225
-
226
- def resize_norm_img_svtr(self, img, image_shape):
227
- imgC, imgH, imgW = image_shape
228
- resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
229
- resized_image = resized_image.astype("float32")
230
- resized_image = resized_image.transpose((2, 0, 1)) / 255
231
- resized_image -= 0.5
232
- resized_image /= 0.5
233
- return resized_image
234
-
235
- def resize_norm_img_abinet(self, img, image_shape):
236
- imgC, imgH, imgW = image_shape
237
-
238
- resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
239
- resized_image = resized_image.astype("float32")
240
- resized_image = resized_image / 255.0
241
-
242
- mean = np.array([0.485, 0.456, 0.406])
243
- std = np.array([0.229, 0.224, 0.225])
244
- resized_image = (resized_image - mean[None, None, ...]) / std[None, None, ...]
245
- resized_image = resized_image.transpose((2, 0, 1))
246
- resized_image = resized_image.astype("float32")
247
-
248
- return resized_image
249
-
250
- def norm_img_can(self, img, image_shape):
251
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image
252
-
253
- if self.inverse:
254
- img = 255 - img
255
-
256
- if self.rec_image_shape[0] == 1:
257
- h, w = img.shape
258
- _, imgH, imgW = self.rec_image_shape
259
- if h < imgH or w < imgW:
260
- padding_h = max(imgH - h, 0)
261
- padding_w = max(imgW - w, 0)
262
- img_padded = np.pad(
263
- img,
264
- ((0, padding_h), (0, padding_w)),
265
- "constant",
266
- constant_values=(255),
267
- )
268
- img = img_padded
269
-
270
- img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w
271
- img = img.astype("float32")
272
-
273
- return img
274
-
275
- def __call__(self, img_list):
276
- img_num = len(img_list)
277
- # Calculate the aspect ratio of all text bars
278
- width_list = []
279
- for img in img_list:
280
- width_list.append(img.shape[1] / float(img.shape[0]))
281
- # Sorting can speed up the recognition process
282
- indices = np.argsort(np.array(width_list))
283
- rec_res = [["", 0.0]] * img_num
284
- batch_num = self.rec_batch_num
285
-
286
- for beg_img_no in range(0, img_num, batch_num):
287
- end_img_no = min(img_num, beg_img_no + batch_num)
288
- norm_img_batch = []
289
- imgC, imgH, imgW = self.rec_image_shape[:3]
290
- max_wh_ratio = imgW / imgH
291
- # max_wh_ratio = 0
292
- for ino in range(beg_img_no, end_img_no):
293
- h, w = img_list[indices[ino]].shape[0:2]
294
- wh_ratio = w * 1.0 / h
295
- max_wh_ratio = max(max_wh_ratio, wh_ratio)
296
- for ino in range(beg_img_no, end_img_no):
297
- norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio)
298
- norm_img = norm_img[np.newaxis, :]
299
- norm_img_batch.append(norm_img)
300
-
301
- norm_img_batch = np.concatenate(norm_img_batch)
302
- norm_img_batch = norm_img_batch.copy()
303
-
304
- # img = img[:, :, ::-1].transpose(2, 0, 1)
305
- # img = img[:, :, ::-1]
306
- # img = img.transpose(2, 0, 1)
307
- # img = img.astype(np.float32)
308
- # img = np.expand_dims(img, axis=0)
309
- # print(img.shape)
310
- input_feed = self.get_input_feed(self.rec_input_name, norm_img_batch)
311
- outputs = self.rec_onnx_session.run(
312
- self.rec_output_name, input_feed=input_feed
313
- )
314
-
315
- preds = outputs[0]
316
-
317
- rec_result = self.postprocess_op(preds)
318
- for rno in range(len(rec_result)):
319
- rec_res[indices[beg_img_no + rno]] = rec_result[rno]
320
-
321
- return rec_res
doc_page_extractor/onnxocr/predict_system.py +0 -97
@@ -1,97 +0,0 @@
1
- import os
2
- import cv2
3
- import copy
4
-
5
- from . import predict_det
6
- from . import predict_cls
7
- from . import predict_rec
8
- from .utils import get_rotate_crop_image, get_minarea_rect_crop
9
-
10
- class TextSystem:
11
- def __init__(self, args):
12
- self.text_detector = predict_det.TextDetector(args)
13
- self.text_recognizer = predict_rec.TextRecognizer(args)
14
- self.use_angle_cls = True
15
- self.drop_score = args.drop_score
16
- if self.use_angle_cls:
17
- self.text_classifier = predict_cls.TextClassifier(args)
18
-
19
- self.args = args
20
- self.crop_image_res_index = 0
21
-
22
- def draw_crop_rec_res(self, output_dir, img_crop_list, rec_res):
23
- os.makedirs(output_dir, exist_ok=True)
24
- bbox_num = len(img_crop_list)
25
- for bno in range(bbox_num):
26
- cv2.imwrite(
27
- os.path.join(
28
- output_dir, f"mg_crop_{bno + self.crop_image_res_index}.jpg"
29
- ),
30
- img_crop_list[bno],
31
- )
32
-
33
- self.crop_image_res_index += bbox_num
34
-
35
- def __call__(self, img, cls=True):
36
- ori_im = img.copy()
37
- # 文字检测
38
- dt_boxes = self.text_detector(img)
39
-
40
- if dt_boxes is None:
41
- return None, None
42
-
43
- img_crop_list = []
44
-
45
- dt_boxes = sorted_boxes(dt_boxes)
46
-
47
- # 图片裁剪
48
- for bno in range(len(dt_boxes)):
49
- tmp_box = copy.deepcopy(dt_boxes[bno])
50
- if self.args.det_box_type == "quad":
51
- img_crop = get_rotate_crop_image(ori_im, tmp_box)
52
- else:
53
- img_crop = get_minarea_rect_crop(ori_im, tmp_box)
54
- img_crop_list.append(img_crop)
55
-
56
- # 方向分类
57
- if self.use_angle_cls and cls:
58
- img_crop_list, angle_list = self.text_classifier(img_crop_list)
59
-
60
- # 图像识别
61
- rec_res = self.text_recognizer(img_crop_list)
62
-
63
- if self.args.save_crop_res:
64
- self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, rec_res)
65
- filter_boxes, filter_rec_res = [], []
66
- for box, rec_result in zip(dt_boxes, rec_res):
67
- text, score = rec_result
68
- if score >= self.drop_score:
69
- filter_boxes.append(box)
70
- filter_rec_res.append(rec_result)
71
-
72
- return filter_boxes, filter_rec_res
73
-
74
-
75
- def sorted_boxes(dt_boxes):
76
- """
77
- Sort text boxes in order from top to bottom, left to right
78
- args:
79
- dt_boxes(array):detected text boxes with shape [4, 2]
80
- return:
81
- sorted boxes(array) with shape [4, 2]
82
- """
83
- num_boxes = dt_boxes.shape[0]
84
- sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
85
- _boxes = list(sorted_boxes)
86
-
87
- for i in range(num_boxes - 1):
88
- for j in range(i, -1, -1):
89
- if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and (
90
- _boxes[j + 1][0][0] < _boxes[j][0][0]
91
- ):
92
- tmp = _boxes[j]
93
- _boxes[j] = _boxes[j + 1]
94
- _boxes[j + 1] = tmp
95
- else:
96
- break
97
- return _boxes