mineru 2.6.7__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. mineru/backend/hybrid/__init__.py +1 -0
  2. mineru/backend/hybrid/hybrid_analyze.py +526 -0
  3. mineru/backend/hybrid/hybrid_magic_model.py +617 -0
  4. mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
  5. mineru/backend/pipeline/batch_analyze.py +9 -1
  6. mineru/backend/pipeline/model_init.py +96 -1
  7. mineru/backend/pipeline/pipeline_analyze.py +6 -4
  8. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
  9. mineru/backend/vlm/utils.py +3 -1
  10. mineru/backend/vlm/vlm_analyze.py +12 -12
  11. mineru/backend/vlm/vlm_magic_model.py +24 -89
  12. mineru/backend/vlm/vlm_middle_json_mkcontent.py +118 -19
  13. mineru/cli/client.py +17 -17
  14. mineru/cli/common.py +170 -20
  15. mineru/cli/fast_api.py +39 -13
  16. mineru/cli/gradio_app.py +232 -206
  17. mineru/model/mfd/yolo_v8.py +12 -6
  18. mineru/model/mfr/unimernet/Unimernet.py +71 -3
  19. mineru/resources/header.html +5 -1
  20. mineru/utils/boxbase.py +23 -0
  21. mineru/utils/char_utils.py +55 -0
  22. mineru/utils/engine_utils.py +74 -0
  23. mineru/utils/enum_class.py +18 -1
  24. mineru/utils/magic_model_utils.py +85 -2
  25. mineru/utils/span_pre_proc.py +5 -3
  26. mineru/utils/table_merge.py +5 -21
  27. mineru/version.py +1 -1
  28. mineru-2.7.0.dist-info/METADATA +433 -0
  29. {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
  30. mineru-2.6.7.dist-info/METADATA +0 -954
  31. {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
  32. {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
  33. {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
  34. {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,212 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
2
+
3
+ import os
4
+ import time
5
+
6
+ import cv2
7
+ import numpy as np
8
+ from loguru import logger
9
+
10
+ from mineru.backend.hybrid.hybrid_magic_model import MagicModel
11
+ from mineru.backend.utils import cross_page_table_merge
12
+ from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
13
+ from mineru.utils.cut_image import cut_image_and_table
14
+ from mineru.utils.enum_class import ContentType
15
+ from mineru.utils.hash_utils import bytes_md5
16
+ from mineru.utils.ocr_utils import OcrConfidence
17
+ from mineru.utils.pdf_image_tools import get_crop_img
18
+ from mineru.version import __version__
19
+
20
+
21
+ heading_level_import_success = False
22
+ llm_aided_config = get_llm_aided_config()
23
+ if llm_aided_config:
24
+ title_aided_config = llm_aided_config.get('title_aided', {})
25
+ if title_aided_config.get('enable', False):
26
+ try:
27
+ from mineru.utils.llm_aided import llm_aided_title
28
+ from mineru.backend.pipeline.model_init import AtomModelSingleton
29
+ heading_level_import_success = True
30
+ except Exception as e:
31
+ logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, "
32
+ "please execute `pip install mineru[core]` to install the required packages.")
33
+
34
+
35
+ def blocks_to_page_info(
36
+ page_blocks,
37
+ page_inline_formula,
38
+ page_ocr_res,
39
+ image_dict,
40
+ page,
41
+ image_writer,
42
+ page_index,
43
+ _ocr_enable,
44
+ _vlm_ocr_enable,
45
+ ) -> dict:
46
+ """将blocks转换为页面信息"""
47
+
48
+ scale = image_dict["scale"]
49
+ page_pil_img = image_dict["img_pil"]
50
+ page_img_md5 = bytes_md5(page_pil_img.tobytes())
51
+ width, height = map(int, page.get_size())
52
+
53
+ magic_model = MagicModel(
54
+ page_blocks,
55
+ page_inline_formula,
56
+ page_ocr_res,
57
+ page,
58
+ scale,
59
+ page_pil_img,
60
+ width,
61
+ height,
62
+ _ocr_enable,
63
+ _vlm_ocr_enable,
64
+ )
65
+ image_blocks = magic_model.get_image_blocks()
66
+ table_blocks = magic_model.get_table_blocks()
67
+ title_blocks = magic_model.get_title_blocks()
68
+ discarded_blocks = magic_model.get_discarded_blocks()
69
+ code_blocks = magic_model.get_code_blocks()
70
+ ref_text_blocks = magic_model.get_ref_text_blocks()
71
+ phonetic_blocks = magic_model.get_phonetic_blocks()
72
+ list_blocks = magic_model.get_list_blocks()
73
+
74
+ # 如果有标题优化需求,计算标题的平均行高
75
+ if heading_level_import_success:
76
+ if _vlm_ocr_enable: # vlm_ocr导致没有line信息,需要重新det获取平均行高
77
+ atom_model_manager = AtomModelSingleton()
78
+ ocr_model = atom_model_manager.get_atom_model(
79
+ atom_model_name='ocr',
80
+ ocr_show_log=False,
81
+ det_db_box_thresh=0.3,
82
+ lang='ch_lite'
83
+ )
84
+ for title_block in title_blocks:
85
+ title_pil_img = get_crop_img(title_block['bbox'], page_pil_img, scale)
86
+ title_np_img = np.array(title_pil_img)
87
+ # 给title_pil_img添加上下左右各50像素白边padding
88
+ title_np_img = cv2.copyMakeBorder(
89
+ title_np_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
90
+ )
91
+ title_img = cv2.cvtColor(title_np_img, cv2.COLOR_RGB2BGR)
92
+ ocr_det_res = ocr_model.ocr(title_img, rec=False)[0]
93
+ if len(ocr_det_res) > 0:
94
+ # 计算所有res的平均高度
95
+ avg_height = np.mean([box[2][1] - box[0][1] for box in ocr_det_res])
96
+ title_block['line_avg_height'] = round(avg_height/scale)
97
+ else: # 有line信息,直接计算平均行高
98
+ for title_block in title_blocks:
99
+ lines = title_block.get('lines', [])
100
+ if lines:
101
+ # 使用列表推导式和内置函数,一次性计算平均高度
102
+ avg_height = sum(line['bbox'][3] - line['bbox'][1] for line in lines) / len(lines)
103
+ title_block['line_avg_height'] = round(avg_height)
104
+ else:
105
+ title_block['line_avg_height'] = title_block['bbox'][3] - title_block['bbox'][1]
106
+
107
+ text_blocks = magic_model.get_text_blocks()
108
+ interline_equation_blocks = magic_model.get_interline_equation_blocks()
109
+
110
+ all_spans = magic_model.get_all_spans()
111
+ # 对image/table/interline_equation的span截图
112
+ for span in all_spans:
113
+ if span["type"] in [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]:
114
+ span = cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)
115
+
116
+ page_blocks = []
117
+ page_blocks.extend([
118
+ *image_blocks,
119
+ *table_blocks,
120
+ *code_blocks,
121
+ *ref_text_blocks,
122
+ *phonetic_blocks,
123
+ *title_blocks,
124
+ *text_blocks,
125
+ *interline_equation_blocks,
126
+ *list_blocks,
127
+ ])
128
+ # 对page_blocks根据index的值进行排序
129
+ page_blocks.sort(key=lambda x: x["index"])
130
+
131
+ page_info = {"para_blocks": page_blocks, "discarded_blocks": discarded_blocks, "page_size": [width, height], "page_idx": page_index}
132
+ return page_info
133
+
134
+
135
+ def result_to_middle_json(
136
+ model_output_blocks_list,
137
+ inline_formula_list,
138
+ ocr_res_list,
139
+ images_list,
140
+ pdf_doc,
141
+ image_writer,
142
+ _ocr_enable,
143
+ _vlm_ocr_enable,
144
+ hybrid_pipeline_model,
145
+ ):
146
+ middle_json = {
147
+ "pdf_info": [],
148
+ "_backend": "hybrid",
149
+ "_ocr_enable": _ocr_enable,
150
+ "_vlm_ocr_enable": _vlm_ocr_enable,
151
+ "_version_name": __version__
152
+ }
153
+
154
+ for index, (page_blocks, page_inline_formula, page_ocr_res) in enumerate(zip(model_output_blocks_list, inline_formula_list, ocr_res_list)):
155
+ page = pdf_doc[index]
156
+ image_dict = images_list[index]
157
+ page_info = blocks_to_page_info(
158
+ page_blocks, page_inline_formula, page_ocr_res,
159
+ image_dict, page, image_writer, index,
160
+ _ocr_enable, _vlm_ocr_enable
161
+ )
162
+ middle_json["pdf_info"].append(page_info)
163
+
164
+ if not (_vlm_ocr_enable or _ocr_enable):
165
+ """后置ocr处理"""
166
+ need_ocr_list = []
167
+ img_crop_list = []
168
+ text_block_list = []
169
+ for page_info in middle_json["pdf_info"]:
170
+ for block in page_info['para_blocks']:
171
+ if block['type'] in ['table', 'image', 'list', 'code']:
172
+ for sub_block in block['blocks']:
173
+ if not sub_block['type'].endswith('body'):
174
+ text_block_list.append(sub_block)
175
+ elif block['type'] in ['text', 'title', 'ref_text']:
176
+ text_block_list.append(block)
177
+ for block in page_info['discarded_blocks']:
178
+ text_block_list.append(block)
179
+ for block in text_block_list:
180
+ for line in block['lines']:
181
+ for span in line['spans']:
182
+ if 'np_img' in span:
183
+ need_ocr_list.append(span)
184
+ img_crop_list.append(span['np_img'])
185
+ span.pop('np_img')
186
+ if len(img_crop_list) > 0:
187
+ ocr_res_list = hybrid_pipeline_model.ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
188
+ assert len(ocr_res_list) == len(
189
+ need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
190
+ for index, span in enumerate(need_ocr_list):
191
+ ocr_text, ocr_score = ocr_res_list[index]
192
+ if ocr_score > OcrConfidence.min_confidence:
193
+ span['content'] = ocr_text
194
+ span['score'] = float(f"{ocr_score:.3f}")
195
+ else:
196
+ span['content'] = ''
197
+ span['score'] = 0.0
198
+
199
+ """表格跨页合并"""
200
+ table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
201
+ if table_enable:
202
+ cross_page_table_merge(middle_json["pdf_info"])
203
+
204
+ """llm优化标题分级"""
205
+ if heading_level_import_success:
206
+ llm_aided_title_start_time = time.time()
207
+ llm_aided_title(middle_json["pdf_info"], title_aided_config)
208
+ logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
209
+
210
+ # 关闭pdf文档
211
+ pdf_doc.close()
212
+ return middle_json
@@ -420,7 +420,15 @@ class BatchAnalyze:
420
420
  layout_res_item['poly'][4], layout_res_item['poly'][5]]
421
421
  layout_res_width = layout_res_bbox[2] - layout_res_bbox[0]
422
422
  layout_res_height = layout_res_bbox[3] - layout_res_bbox[1]
423
- if ocr_text in ['(204号', '(20', '(2', '(2号', '(20号', '号', '(204'] and ocr_score < 0.8 and layout_res_width < layout_res_height:
423
+ if (
424
+ ocr_text in [
425
+ '(204号', '(20', '(2', '(2号', '(20号', '号', '(204',
426
+ '(cid:)', '(ci:)', '(cd:1)', 'cd:)', 'c)', '(cd:)', 'c', 'id:)',
427
+ ':)', '√:)', '√i:)', '−i:)', '−:', 'i:)',
428
+ ]
429
+ and ocr_score < 0.8
430
+ and layout_res_width < layout_res_height
431
+ ):
424
432
  layout_res_item['category_id'] = 16
425
433
 
426
434
  total_processed += len(img_crop_list)
@@ -14,6 +14,7 @@ from ...model.table.cls.paddle_table_cls import PaddleTableClsModel
14
14
  # from ...model.table.rec.RapidTable import RapidTableModel
15
15
  from ...model.table.rec.slanet_plus.main import RapidTableModel
16
16
  from ...model.table.rec.unet_table.main import UnetTableModel
17
+ from ...utils.config_reader import get_device
17
18
  from ...utils.enum_class import ModelPath
18
19
  from ...utils.models_download_utils import auto_download_and_get_model_root_path
19
20
 
@@ -267,4 +268,98 @@ class MineruPipelineModel:
267
268
  lang=self.lang,
268
269
  )
269
270
 
270
- logger.info('DocAnalysis init done!')
271
+ logger.info('DocAnalysis init done!')
272
+
273
+
274
+ class HybridModelSingleton:
275
+ _instance = None
276
+ _models = {}
277
+
278
+ def __new__(cls, *args, **kwargs):
279
+ if cls._instance is None:
280
+ cls._instance = super().__new__(cls)
281
+ return cls._instance
282
+
283
+ def get_model(
284
+ self,
285
+ lang=None,
286
+ formula_enable=None,
287
+ ):
288
+ key = (lang, formula_enable)
289
+ if key not in self._models:
290
+ self._models[key] = MineruHybridModel(
291
+ lang=lang,
292
+ formula_enable=formula_enable,
293
+ )
294
+ return self._models[key]
295
+
296
+ def ocr_det_batch_setting(device):
297
+ # 检测torch的版本号
298
+ import torch
299
+ from packaging import version
300
+ if version.parse(torch.__version__) >= version.parse("2.8.0") or str(device).startswith('mps'):
301
+ enable_ocr_det_batch = False
302
+ else:
303
+ enable_ocr_det_batch = True
304
+ return enable_ocr_det_batch
305
+
306
+ class MineruHybridModel:
307
+ def __init__(
308
+ self,
309
+ device=None,
310
+ lang=None,
311
+ formula_enable=True,
312
+ ):
313
+ if device is not None:
314
+ self.device = device
315
+ else:
316
+ self.device = get_device()
317
+
318
+ self.lang = lang
319
+
320
+ self.enable_ocr_det_batch = ocr_det_batch_setting(self.device)
321
+
322
+ if str(self.device).startswith('npu'):
323
+ try:
324
+ import torch_npu
325
+ if torch_npu.npu.is_available():
326
+ torch_npu.npu.set_compile_mode(jit_compile=False)
327
+ except Exception as e:
328
+ raise RuntimeError(
329
+ "NPU is selected as device, but torch_npu is not available. "
330
+ "Please ensure that the torch_npu package is installed correctly."
331
+ ) from e
332
+
333
+ self.atom_model_manager = AtomModelSingleton()
334
+
335
+ # 初始化OCR模型
336
+ self.ocr_model = self.atom_model_manager.get_atom_model(
337
+ atom_model_name=AtomicModel.OCR,
338
+ det_db_box_thresh=0.3,
339
+ lang=self.lang
340
+ )
341
+
342
+ if formula_enable:
343
+ # 初始化公式检测模型
344
+ self.mfd_model = self.atom_model_manager.get_atom_model(
345
+ atom_model_name=AtomicModel.MFD,
346
+ mfd_weights=str(
347
+ os.path.join(auto_download_and_get_model_root_path(ModelPath.yolo_v8_mfd), ModelPath.yolo_v8_mfd)
348
+ ),
349
+ device=self.device,
350
+ )
351
+
352
+ # 初始化公式解析模型
353
+ if MFR_MODEL == "unimernet_small":
354
+ mfr_model_path = ModelPath.unimernet_small
355
+ elif MFR_MODEL == "pp_formulanet_plus_m":
356
+ mfr_model_path = ModelPath.pp_formulanet_plus_m
357
+ else:
358
+ logger.error('MFR model name not allow')
359
+ exit(1)
360
+
361
+ self.mfr_model = self.atom_model_manager.get_atom_model(
362
+ atom_model_name=AtomicModel.MFR,
363
+ mfr_weight_dir=str(os.path.join(auto_download_and_get_model_root_path(mfr_model_path), mfr_model_path)),
364
+ device=self.device,
365
+ )
@@ -86,6 +86,7 @@ def doc_analyze(
86
86
  all_image_lists = []
87
87
  all_pdf_docs = []
88
88
  ocr_enabled_list = []
89
+ load_images_start = time.time()
89
90
  for pdf_idx, pdf_bytes in enumerate(pdf_bytes_list):
90
91
  # 确定OCR设置
91
92
  _ocr_enable = False
@@ -99,10 +100,7 @@ def doc_analyze(
99
100
  _lang = lang_list[pdf_idx]
100
101
 
101
102
  # 收集每个数据集中的页面
102
- # load_images_start = time.time()
103
103
  images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
104
- # load_images_time = round(time.time() - load_images_start, 2)
105
- # logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
106
104
  all_image_lists.append(images_list)
107
105
  all_pdf_docs.append(pdf_doc)
108
106
  for page_idx in range(len(images_list)):
@@ -111,6 +109,8 @@ def doc_analyze(
111
109
  pdf_idx, page_idx,
112
110
  img_dict['img_pil'], _ocr_enable, _lang,
113
111
  ))
112
+ load_images_time = round(time.time() - load_images_start, 2)
113
+ logger.debug(f"load images cost: {load_images_time}, speed: {round(len(all_pages_info) / load_images_time, 3)} images/s")
114
114
 
115
115
  # 准备批处理
116
116
  images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info]
@@ -123,6 +123,7 @@ def doc_analyze(
123
123
  # 执行批处理
124
124
  results = []
125
125
  processed_images_count = 0
126
+ infer_start = time.time()
126
127
  for index, batch_image in enumerate(batch_images):
127
128
  processed_images_count += len(batch_image)
128
129
  logger.info(
@@ -131,6 +132,8 @@ def doc_analyze(
131
132
  )
132
133
  batch_results = batch_image_analyze(batch_image, formula_enable, table_enable)
133
134
  results.extend(batch_results)
135
+ infer_time = round(time.time() - infer_start, 2)
136
+ logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / infer_time, 3)} page/s")
134
137
 
135
138
  # 构建返回结果
136
139
  infer_results = []
@@ -185,7 +188,6 @@ def batch_image_analyze(
185
188
  batch_ratio = 1
186
189
  logger.info(
187
190
  f'GPU Memory: {gpu_memory} GB, Batch Ratio: {batch_ratio}. '
188
- f'You can set MINERU_VIRTUAL_VRAM_SIZE environment variable to adjust GPU memory allocation.'
189
191
  )
190
192
 
191
193
  # 检测torch的版本号
@@ -1,25 +1,12 @@
1
- import re
2
1
  from loguru import logger
3
2
 
3
+ from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
4
4
  from mineru.utils.config_reader import get_latex_delimiter_config
5
5
  from mineru.backend.pipeline.para_split import ListLineTag
6
6
  from mineru.utils.enum_class import BlockType, ContentType, MakeMode
7
7
  from mineru.utils.language import detect_lang
8
8
 
9
9
 
10
- def __is_hyphen_at_line_end(line):
11
- """Check if a line ends with one or more letters followed by a hyphen.
12
-
13
- Args:
14
- line (str): The line of text to check.
15
-
16
- Returns:
17
- bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
18
- """
19
- # Use regex to check if the line ends with one or more letters followed by a hyphen
20
- return bool(re.search(r'[A-Za-z]+-\s*$', line))
21
-
22
-
23
10
  def make_blocks_to_markdown(paras_of_layout,
24
11
  mode,
25
12
  img_buket_path='',
@@ -102,25 +89,6 @@ def make_blocks_to_markdown(paras_of_layout,
102
89
  return page_markdown
103
90
 
104
91
 
105
- def full_to_half(text: str) -> str:
106
- """Convert full-width characters to half-width characters using code point manipulation.
107
-
108
- Args:
109
- text: String containing full-width characters
110
-
111
- Returns:
112
- String with full-width characters converted to half-width
113
- """
114
- result = []
115
- for char in text:
116
- code = ord(char)
117
- # Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
118
- if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
119
- result.append(chr(code - 0xFEE0)) # Shift to ASCII range
120
- else:
121
- result.append(char)
122
- return ''.join(result)
123
-
124
92
  latex_delimiters_config = get_latex_delimiter_config()
125
93
 
126
94
  default_delimiters = {
@@ -140,7 +108,7 @@ def merge_para_with_text(para_block):
140
108
  for line in para_block['lines']:
141
109
  for span in line['spans']:
142
110
  if span['type'] in [ContentType.TEXT]:
143
- span['content'] = full_to_half(span['content'])
111
+ span['content'] = full_to_half_exclude_marks(span['content'])
144
112
  block_text += span['content']
145
113
  block_lang = detect_lang(block_text)
146
114
 
@@ -166,22 +134,45 @@ def merge_para_with_text(para_block):
166
134
  content = content.strip()
167
135
 
168
136
  if content:
169
- langs = ['zh', 'ja', 'ko']
137
+
138
+ if span_type == ContentType.INTERLINE_EQUATION:
139
+ para_text += content
140
+ continue
141
+
142
+ # 定义CJK语言集合(中日韩)
143
+ cjk_langs = {'zh', 'ja', 'ko'}
170
144
  # logger.info(f'block_lang: {block_lang}, content: {content}')
171
- if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
172
- if j == len(line['spans']) - 1 and span_type not in [ContentType.INLINE_EQUATION]:
145
+
146
+ # 判断是否为行末span
147
+ is_last_span = j == len(line['spans']) - 1
148
+
149
+ if block_lang in cjk_langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
150
+ if is_last_span and span_type not in [ContentType.INLINE_EQUATION]:
173
151
  para_text += content
174
152
  else:
175
153
  para_text += f'{content} '
176
154
  else:
155
+ # 西方文本语境下 每行的最后一个span判断是否要去除连字符
177
156
  if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
178
157
  # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
179
- if j == len(line['spans'])-1 and span_type == ContentType.TEXT and __is_hyphen_at_line_end(content):
180
- para_text += content[:-1]
158
+ if (
159
+ is_last_span
160
+ and span_type == ContentType.TEXT
161
+ and is_hyphen_at_line_end(content)
162
+ ):
163
+ # 如果下一行的第一个span是小写字母开头,删除连字符
164
+ if (
165
+ i + 1 < len(para_block['lines'])
166
+ and para_block['lines'][i + 1].get('spans')
167
+ and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
168
+ and para_block['lines'][i + 1]['spans'][0].get('content', '')
169
+ and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
170
+ ):
171
+ para_text += content[:-1]
172
+ else: # 如果没有下一行,或者下一行的第一个span不是小写字母开头,则保留连字符但不加空格
173
+ para_text += content
181
174
  else: # 西方文本语境下 content间需要空格分隔
182
175
  para_text += f'{content} '
183
- elif span_type == ContentType.INTERLINE_EQUATION:
184
- para_text += content
185
176
  else:
186
177
  continue
187
178
 
@@ -72,7 +72,9 @@ def set_lmdeploy_backend(device_type: str) -> str:
72
72
 
73
73
  def set_default_gpu_memory_utilization() -> float:
74
74
  from vllm import __version__ as vllm_version
75
- if version.parse(vllm_version) >= version.parse("0.11.0"):
75
+ device = get_device()
76
+ gpu_memory = get_vram(device)
77
+ if version.parse(vllm_version) >= version.parse("0.11.0") and gpu_memory <= 8:
76
78
  return 0.7
77
79
  else:
78
80
  return 0.5
@@ -202,16 +202,16 @@ def doc_analyze(
202
202
  if predictor is None:
203
203
  predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
204
204
 
205
- # load_images_start = time.time()
205
+ load_images_start = time.time()
206
206
  images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
207
207
  images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
208
- # load_images_time = round(time.time() - load_images_start, 2)
209
- # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
208
+ load_images_time = round(time.time() - load_images_start, 2)
209
+ logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
210
210
 
211
- # infer_start = time.time()
211
+ infer_start = time.time()
212
212
  results = predictor.batch_two_step_extract(images=images_pil_list)
213
- # infer_time = round(time.time() - infer_start, 2)
214
- # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
213
+ infer_time = round(time.time() - infer_start, 2)
214
+ logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
215
215
 
216
216
  middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
217
217
  return middle_json, results
@@ -229,15 +229,15 @@ async def aio_doc_analyze(
229
229
  if predictor is None:
230
230
  predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
231
231
 
232
- # load_images_start = time.time()
232
+ load_images_start = time.time()
233
233
  images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
234
234
  images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
235
- # load_images_time = round(time.time() - load_images_start, 2)
236
- # logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
235
+ load_images_time = round(time.time() - load_images_start, 2)
236
+ logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
237
237
 
238
- # infer_start = time.time()
238
+ infer_start = time.time()
239
239
  results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
240
- # infer_time = round(time.time() - infer_start, 2)
241
- # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
240
+ infer_time = round(time.time() - infer_start, 2)
241
+ logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
242
242
  middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
243
243
  return middle_json, results