paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +29 -73
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/ts/funcs.py +19 -8
  44. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  45. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  46. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  48. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  49. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  51. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  52. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  53. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  54. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  57. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  58. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  59. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  60. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  61. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  63. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  64. paddlex/inference/models/formula_recognition/predictor.py +8 -2
  65. paddlex/inference/models/formula_recognition/processors.py +90 -77
  66. paddlex/inference/models/formula_recognition/result.py +28 -27
  67. paddlex/inference/models/image_feature/processors.py +3 -4
  68. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  69. paddlex/inference/models/object_detection/predictor.py +2 -0
  70. paddlex/inference/models/object_detection/processors.py +28 -3
  71. paddlex/inference/models/object_detection/utils.py +2 -0
  72. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  73. paddlex/inference/models/text_detection/predictor.py +8 -0
  74. paddlex/inference/models/text_detection/processors.py +44 -10
  75. paddlex/inference/models/text_detection/result.py +0 -10
  76. paddlex/inference/models/text_recognition/result.py +1 -1
  77. paddlex/inference/pipelines/__init__.py +9 -5
  78. paddlex/inference/pipelines/_parallel.py +172 -0
  79. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  80. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  81. paddlex/inference/pipelines/base.py +14 -4
  82. paddlex/inference/pipelines/components/faisser.py +1 -1
  83. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  84. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  85. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  86. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  87. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  88. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  89. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  90. paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
  91. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  92. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
  93. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  94. paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
  95. paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
  96. paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
  97. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  98. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
  99. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
  100. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  101. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  102. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  103. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  104. paddlex/inference/pipelines/ocr/result.py +21 -18
  105. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  106. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  107. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  108. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  109. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
  110. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  112. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  113. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  114. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  115. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  116. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  117. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  118. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  119. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  120. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  121. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  122. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  123. paddlex/inference/serving/basic_serving/_app.py +46 -13
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  127. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  128. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  129. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  130. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  131. paddlex/inference/serving/infra/utils.py +20 -22
  132. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  133. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  134. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  135. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  136. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  137. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  138. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  139. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  140. paddlex/inference/utils/hpi.py +30 -16
  141. paddlex/inference/utils/hpi_model_info_collection.json +666 -162
  142. paddlex/inference/utils/io/readers.py +12 -12
  143. paddlex/inference/utils/misc.py +20 -0
  144. paddlex/inference/utils/mkldnn_blocklist.py +59 -0
  145. paddlex/inference/utils/official_models.py +140 -5
  146. paddlex/inference/utils/pp_option.py +74 -9
  147. paddlex/model.py +2 -2
  148. paddlex/modules/__init__.py +1 -1
  149. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  150. paddlex/modules/base/__init__.py +1 -1
  151. paddlex/modules/base/evaluator.py +5 -5
  152. paddlex/modules/base/trainer.py +1 -1
  153. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  154. paddlex/modules/doc_vlm/evaluator.py +2 -2
  155. paddlex/modules/doc_vlm/exportor.py +2 -2
  156. paddlex/modules/doc_vlm/model_list.py +1 -1
  157. paddlex/modules/doc_vlm/trainer.py +2 -2
  158. paddlex/modules/face_recognition/evaluator.py +2 -2
  159. paddlex/modules/formula_recognition/evaluator.py +5 -2
  160. paddlex/modules/formula_recognition/model_list.py +3 -0
  161. paddlex/modules/formula_recognition/trainer.py +3 -0
  162. paddlex/modules/general_recognition/evaluator.py +1 -1
  163. paddlex/modules/image_classification/evaluator.py +2 -2
  164. paddlex/modules/image_classification/model_list.py +1 -0
  165. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  166. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  167. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  168. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  169. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  170. paddlex/modules/object_detection/evaluator.py +2 -2
  171. paddlex/modules/object_detection/model_list.py +2 -0
  172. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
  173. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  174. paddlex/modules/table_recognition/evaluator.py +2 -2
  175. paddlex/modules/text_detection/evaluator.py +2 -2
  176. paddlex/modules/text_detection/model_list.py +2 -0
  177. paddlex/modules/text_recognition/evaluator.py +2 -2
  178. paddlex/modules/text_recognition/model_list.py +2 -0
  179. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  180. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  181. paddlex/modules/ts_classification/evaluator.py +2 -2
  182. paddlex/modules/ts_forecast/evaluator.py +2 -2
  183. paddlex/modules/video_classification/evaluator.py +2 -2
  184. paddlex/modules/video_detection/evaluator.py +2 -2
  185. paddlex/ops/__init__.py +8 -5
  186. paddlex/paddlex_cli.py +19 -13
  187. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  188. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  189. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  190. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  191. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  192. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  193. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  194. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  195. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  196. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  197. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  198. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  200. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  201. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  202. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  203. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  204. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  205. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  206. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  207. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  208. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  209. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  210. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  211. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  212. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  213. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  214. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  215. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  216. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  217. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  218. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  219. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  220. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  221. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  222. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  223. paddlex/repo_apis/base/config.py +1 -1
  224. paddlex/repo_manager/core.py +3 -3
  225. paddlex/repo_manager/meta.py +6 -2
  226. paddlex/repo_manager/repo.py +17 -16
  227. paddlex/utils/custom_device_list.py +26 -2
  228. paddlex/utils/deps.py +3 -3
  229. paddlex/utils/device.py +5 -13
  230. paddlex/utils/env.py +4 -0
  231. paddlex/utils/flags.py +11 -4
  232. paddlex/utils/fonts/__init__.py +34 -4
  233. paddlex/utils/misc.py +1 -1
  234. paddlex/utils/subclass_register.py +2 -2
  235. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
  236. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
  237. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
  238. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
  239. {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
  240. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
@@ -12,15 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import base64
-import math
-from io import BytesIO
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Union

 import numpy as np
-import paddle
-import requests
-from PIL import Image

 from .....utils import logging
 from ....utils.benchmark import benchmark
@@ -33,10 +27,12 @@ from .common import (
     TensorType,
     TextInput,
     convert_to_rgb,
+    fetch_image,
     get_image_size,
     infer_channel_dimension_format,
-    is_valid_image,
+    make_batched_images,
     make_list_of_images,
+    smart_resize,
     to_channel_dimension_format,
     to_numpy_array,
     valid_images,
@@ -82,7 +78,7 @@ class Qwen2VLProcessor(object):
         self.image_processor.min_pixels = kwargs.get("min_pixels", 3136)
         self.image_processor.max_pixels = kwargs.get("max_pixels", 12845056)

-    def _preprocess(
+    def preprocess(
         self,
         images: ImageInput = None,
         text: Union[TextInput, List[TextInput]] = None,
@@ -182,33 +178,6 @@ class Qwen2VLProcessor(object):
         return self.tokenizer.decode(*args, **kwargs)


-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if (
-        isinstance(images, (list, tuple))
-        and isinstance(images[0], (list, tuple))
-        and is_valid_image(images[0][0])
-    ):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched images from {images}")
-
-
 class Qwen2VLImageProcessor(object):
     r"""
     Constructs a Qwen2-VL image processor that dynamically resizes images based on the original images.
@@ -360,6 +329,7 @@ class Qwen2VLImageProcessor(object):
                     factor=self.patch_size * self.merge_size,
                     min_pixels=self.min_pixels,
                     max_pixels=self.max_pixels,
+                    max_ratio=MAX_RATIO,
                 )
             image = image.astype("uint8")
             image = resize(
@@ -527,159 +497,34 @@ class Qwen2VLImageProcessor(object):
         return self.preprocess(images, **kwargs)


-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-
-
-def ceil_by_factor(number: int, factor: int) -> int:
-    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-    return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: int, factor: int) -> int:
-    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-    return math.floor(number / factor) * factor
-
-
-def smart_resize(
-    height: int,
-    width: int,
-    factor: int = IMAGE_FACTOR,
-    min_pixels: int = MIN_PIXELS,
-    max_pixels: int = MAX_PIXELS,
-) -> Tuple[int, int]:
-    """
-    Rescales the image so that the following conditions are met:
-
-    1. Both dimensions (height and width) are divisible by 'factor'.
-
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    if max(height, width) / min(height, width) > MAX_RATIO:
-        raise ValueError(
-            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
-        )
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    return h_bar, w_bar
-
-
-def fetch_image(
-    ele: Dict[str, Union[str, Image.Image]], size_factor: int = IMAGE_FACTOR
-) -> Image.Image:
-    if not isinstance(ele, dict):
-        ele = {"image": ele}
-    if "image" in ele:
-        image = ele["image"]
-    else:
-        image = ele["image_url"]
-    image_obj = None
-    if isinstance(image, Image.Image):
-        image_obj = image
-    elif isinstance(image, np.ndarray):
-        image_obj = Image.fromarray(image)
-    elif image.startswith("http://") or image.startswith("https://"):
-        image_obj = Image.open(requests.get(image, stream=True).raw)
-    elif image.startswith("file://"):
-        image_obj = Image.open(image[7:])
-    elif image.startswith("data:image"):
-        data = image.split(";", 1)[1]
-        if data.startswith("base64,"):
-            data = base64.b64decode(data[7:])
-            image_obj = Image.open(BytesIO(data))
-    else:
-        image_obj = Image.open(image)
-    if image_obj is None:
-        raise ValueError(
-            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
-        )
-    image = image_obj.convert("RGB")
-    # resize
-    if "resized_height" in ele and "resized_width" in ele:
-        resized_height, resized_width = smart_resize(
-            ele["resized_height"],
-            ele["resized_width"],
-            factor=size_factor,
-        )
-    else:
-        width, height = image.size  # Image, not tensor
-        min_pixels = ele.get("min_pixels", MIN_PIXELS)
-        max_pixels = ele.get("max_pixels", MAX_PIXELS)
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=size_factor,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-    image = image.resize((resized_width, resized_height))
-
-    return image
-
-
-def extract_vision_info(
-    conversations: Union[List[dict], List[List[dict]]]
-) -> List[dict]:
-    vision_infos = []
-    if isinstance(conversations[0], dict):
-        conversations = [conversations]
-    for conversation in conversations:
-        for message in conversation:
-            if isinstance(message["content"], list):
-                for ele in message["content"]:
-                    if (
-                        "image" in ele
-                        or "image_url" in ele
-                        or ele["type"] in ("image", "image_url")
-                    ):
-                        vision_infos.append(ele)
-    return vision_infos
-
-
-def process_vision_info(
-    conversations: Union[List[dict], List[List[dict]]],
-) -> Tuple[
-    Union[List[Image.Image], None, List[Union[paddle.Tensor, List[Image.Image]]], None]
-]:
-    vision_infos = extract_vision_info(conversations)
-    image_inputs = []
-    for vision_info in vision_infos:
-        if "image" in vision_info or "image_url" in vision_info:
-            image_inputs.append(fetch_image(vision_info))
-        else:
-            raise ValueError("image, image_url should in content.")
-    if len(image_inputs) == 0:
-        image_inputs = None
-    return image_inputs
-
-
 class PPDocBeeProcessor(Qwen2VLProcessor):
     """
     PP-DocBee processor, based on Qwen2VLProcessor
     """

     @benchmark.timeit
-    def preprocess(self, image: Union[str, Image.Image, np.ndarray], query: str):
+    def preprocess(self, input_dicts):
         """
         PreProcess for PP-DocBee Series
         """
-        image_inputs = fetch_image(image)
+        assert (
+            isinstance(input_dicts, list) and len(input_dicts) == 1
+        ), f"PP-DocBee series only supports batchsize of one, but received {len(input_dicts)} samples."
+        input_dict = input_dicts[0]
+        image = input_dict["image"]
+        query = input_dict["query"]
+        image_inputs = fetch_image(
+            image,
+            size_factor=IMAGE_FACTOR,
+            min_pixels=MIN_PIXELS,
+            max_pixels=MAX_PIXELS,
+            max_ratio=MAX_RATIO,
+        )
         image_pad_token = "<|vision_start|><|image_pad|><|vision_end|>"
         text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_token}{query}<|im_end|>\n<|im_start|>assistant\n"
         text = [text]

-        rst_inputs = self._preprocess(
+        rst_inputs = super().preprocess(
            text=text,
            images=[image_inputs],
            padding=False,
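
The rewritten `PPDocBeeProcessor.preprocess` above no longer takes a separate `image` and `query`; it expects a one-element list of dicts and resolves the image itself through the shared `fetch_image` helper. A minimal sketch of the input contract implied by that assertion (the caller shown here is hypothetical; inside PaddleX the doc-VLM predictor builds this list from its batch sampler):

```python
# Hypothetical caller, illustrating the shape preprocess() now expects.
input_dicts = [
    {
        "image": "path/to/page.png",  # anything fetch_image accepts: path, URL, ndarray, PIL image
        "query": "What is the title of this document?",
    }
]
# processor.preprocess(input_dicts)  # exactly one sample per call, per the assertion above
```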
@@ -97,7 +97,13 @@ class FormulaRecPredictor(BasePredictor):
             batch_imgs = self.pre_tfs["UniMERNetImgDecode"](imgs=batch_raw_imgs)
             batch_imgs = self.pre_tfs["UniMERNetTestTransform"](imgs=batch_imgs)
             batch_imgs = self.pre_tfs["UniMERNetImageFormat"](imgs=batch_imgs)
-        elif self.model_name in ("PP-FormulaNet-S", "PP-FormulaNet-L"):
+        elif self.model_name in (
+            "PP-FormulaNet-S",
+            "PP-FormulaNet-L",
+            "PP-FormulaNet_plus-S",
+            "PP-FormulaNet_plus-M",
+            "PP-FormulaNet_plus-L",
+        ):
             batch_imgs = self.pre_tfs["UniMERNetImgDecode"](imgs=batch_raw_imgs)
             batch_imgs = self.pre_tfs["UniMERNetTestTransform"](imgs=batch_imgs)
             batch_imgs = self.pre_tfs["LatexImageFormat"](imgs=batch_imgs)
@@ -130,7 +136,7 @@
         }

     @register("DecodeImage")
-    def build_readimg(self, channel_first, img_mode):
+    def build_readimg(self, channel_first, img_mode="RGB"):
         assert channel_first == False
         return "Read", ReadImage(format=img_mode)

@@ -15,9 +15,7 @@

 import json
 import math
-import os
 import re
-import tempfile
 from typing import Any, Dict, List, Optional, Tuple, Union

 import numpy as np
@@ -325,14 +323,9 @@ class LaTeXOCRDecode(object):
             **kwargs: Additional keyword arguments for initialization.
         """
         super(LaTeXOCRDecode, self).__init__()
-        temp_path = tempfile.gettempdir()
-        rec_char_dict_path = os.path.join(temp_path, "latexocr_tokenizer.json")
-        try:
-            with open(rec_char_dict_path, "w") as f:
-                json.dump(character_list, f)
-        except Exception as e:
-            print(f"创建 latexocr_tokenizer.json 文件失败, 原因{str(e)}")
-        self.tokenizer = TokenizerFast.from_file(rec_char_dict_path)
+        fast_tokenizer_str = json.dumps(character_list)
+        fast_tokenizer_buffer = fast_tokenizer_str.encode("utf-8")
+        self.tokenizer = TokenizerFast.from_buffer(fast_tokenizer_buffer)

     def post_process(self, s: str) -> str:
         """Post-processes the decoded LaTeX string.
@@ -631,74 +624,65 @@ class UniMERNetDecode(object):
         self.pad_token_type_id = 0
         self.pad_to_multiple_of = None

-        temp_path = tempfile.gettempdir()
-        fast_tokenizer_file = os.path.join(temp_path, "tokenizer.json")
-        tokenizer_config_file = os.path.join(temp_path, "tokenizer_config.json")
-        try:
-            with open(fast_tokenizer_file, "w") as f:
-                json.dump(character_list["fast_tokenizer_file"], f)
-            with open(tokenizer_config_file, "w") as f:
-                json.dump(character_list["tokenizer_config_file"], f)
-        except Exception as e:
-            print(
-                f"创建 tokenizer.json 和 tokenizer_config.json 文件失败, 原因{str(e)}"
-            )
-
-        self.tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
+        fast_tokenizer_str = json.dumps(character_list["fast_tokenizer_file"])
+        fast_tokenizer_buffer = fast_tokenizer_str.encode("utf-8")
+        self.tokenizer = TokenizerFast.from_buffer(fast_tokenizer_buffer)
+        tokenizer_config = (
+            character_list["tokenizer_config_file"]
+            if "tokenizer_config_file" in character_list
+            else None
+        )
         added_tokens_decoder = {}
         added_tokens_map = {}
-        if tokenizer_config_file is not None:
-            with open(
-                tokenizer_config_file, encoding="utf-8"
-            ) as tokenizer_config_handle:
-                init_kwargs = json.load(tokenizer_config_handle)
-                if "added_tokens_decoder" in init_kwargs:
-                    for idx, token in init_kwargs["added_tokens_decoder"].items():
-                        if isinstance(token, dict):
-                            token = AddedToken(**token)
-                        if isinstance(token, AddedToken):
-                            added_tokens_decoder[int(idx)] = token
-                            added_tokens_map[str(token)] = token
-                        else:
-                            raise ValueError(
-                                f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
-                            )
-                init_kwargs["added_tokens_decoder"] = added_tokens_decoder
-                added_tokens_decoder = init_kwargs.pop("added_tokens_decoder", {})
-                tokens_to_add = [
-                    token
-                    for index, token in sorted(
-                        added_tokens_decoder.items(), key=lambda x: x[0]
-                    )
-                    if token not in added_tokens_decoder
-                ]
-                added_tokens_encoder = self.added_tokens_encoder(added_tokens_decoder)
-                encoder = list(added_tokens_encoder.keys()) + [
-                    str(token) for token in tokens_to_add
-                ]
-                tokens_to_add += [
-                    token
-                    for token in self.all_special_tokens_extended
-                    if token not in encoder and token not in tokens_to_add
-                ]
-                if len(tokens_to_add) > 0:
-                    is_last_special = None
-                    tokens = []
-                    special_tokens = self.all_special_tokens
-                    for token in tokens_to_add:
-                        is_special = (
-                            (token.special or str(token) in special_tokens)
-                            if isinstance(token, AddedToken)
-                            else str(token) in special_tokens
+        if tokenizer_config is not None:
+            init_kwargs = tokenizer_config
+            if "added_tokens_decoder" in init_kwargs:
+                for idx, token in init_kwargs["added_tokens_decoder"].items():
+                    if isinstance(token, dict):
+                        token = AddedToken(**token)
+                    if isinstance(token, AddedToken):
+                        added_tokens_decoder[int(idx)] = token
+                        added_tokens_map[str(token)] = token
+                    else:
+                        raise ValueError(
+                            f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
                         )
-                        if is_last_special is None or is_last_special == is_special:
-                            tokens.append(token)
-                        else:
-                            self._add_tokens(tokens, special_tokens=is_last_special)
-                            tokens = [token]
-                            is_last_special = is_special
-                    if tokens:
+            init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+            added_tokens_decoder = init_kwargs.pop("added_tokens_decoder", {})
+            tokens_to_add = [
+                token
+                for index, token in sorted(
+                    added_tokens_decoder.items(), key=lambda x: x[0]
+                )
+                if token not in added_tokens_decoder
+            ]
+            added_tokens_encoder = self.added_tokens_encoder(added_tokens_decoder)
+            encoder = list(added_tokens_encoder.keys()) + [
+                str(token) for token in tokens_to_add
+            ]
+            tokens_to_add += [
+                token
+                for token in self.all_special_tokens_extended
+                if token not in encoder and token not in tokens_to_add
+            ]
+            if len(tokens_to_add) > 0:
+                is_last_special = None
+                tokens = []
+                special_tokens = self.all_special_tokens
+                for token in tokens_to_add:
+                    is_special = (
+                        (token.special or str(token) in special_tokens)
+                        if isinstance(token, AddedToken)
+                        else str(token) in special_tokens
+                    )
+                    if is_last_special is None or is_last_special == is_special:
+                        tokens.append(token)
+                    else:
                         self._add_tokens(tokens, special_tokens=is_last_special)
+                        tokens = [token]
+                        is_last_special = is_special
+                if tokens:
+                    self._add_tokens(tokens, special_tokens=is_last_special)

     def _add_tokens(
         self, new_tokens: "List[Union[AddedToken, str]]", special_tokens: bool = False
@@ -858,8 +842,27 @@
         text_reg = r"(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})"
         letter = "[a-zA-Z]"
         noletter = "[\W_^\d]"
-        names = [x[0].replace(" ", "") for x in re.findall(text_reg, s)]
-        s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
+        names = []
+        for x in re.findall(text_reg, s):
+            pattern = r"\\[a-zA-Z]+"
+            pattern = r"(\\[a-zA-Z]+)\s(?=\w)|\\[a-zA-Z]+\s(?=})"
+            matches = re.findall(pattern, x[0])
+            for m in matches:
+                if (
+                    m
+                    not in [
+                        "\\operatorname",
+                        "\\mathrm",
+                        "\\text",
+                        "\\mathbf",
+                    ]
+                    and m.strip() != ""
+                ):
+                    s = s.replace(m, m + "XXXXXXX")
+            s = s.replace(" ", "")
+            names.append(s)
+        if len(names) > 0:
+            s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
         news = s
         while True:
             s = news
@@ -868,7 +871,16 @@
             news = re.sub(r"(%s)\s+?(%s)" % (letter, noletter), r"\1\2", news)
             if news == s:
                 break
-        return s
+        return s.replace("XXXXXXX", " ")
+
+    def remove_chinese_text_wrapping(self, formula):
+        pattern = re.compile(r"\\text\s*{\s*([^}]*?[\u4e00-\u9fff]+[^}]*?)\s*}")
+
+        def replacer(match):
+            return match.group(1)
+
+        replaced_formula = pattern.sub(replacer, formula)
+        return replaced_formula.replace('"', "")

     def post_process(self, text: str) -> str:
         """Post-processes a string by fixing text and normalizing it.
@@ -881,6 +893,7 @@
         """
         from ftfy import fix_text

+        text = self.remove_chinese_text_wrapping(text)
         text = fix_text(text)
         text = self.normalize(text)
         return text
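
`post_process` above now calls `remove_chinese_text_wrapping` before `ftfy.fix_text` and `normalize`, so decoded formulas lose the `\text{...}` wrapper the model emits around Chinese runs. A small illustration of that regex in isolation (the sample formula is made up):

```python
import re

# Same pattern as remove_chinese_text_wrapping above; the input string is illustrative.
pattern = re.compile(r"\\text\s*{\s*([^}]*?[\u4e00-\u9fff]+[^}]*?)\s*}")

formula = r"S = \pi r^2 \text{ 其中 } r > 0"
print(pattern.sub(lambda m: m.group(1), formula).replace('"', ""))
# -> S = \pi r^2 其中 r > 0
```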
@@ -15,9 +15,9 @@
 import copy
 import math
 import os
+import re
 import subprocess
 import tempfile
-from pathlib import Path
 from typing import List, Optional

 import numpy as np
@@ -32,19 +32,11 @@ from ...common.result import BaseCVResult, JsonMixin
 if is_dep_available("opencv-contrib-python"):
     import cv2

-if is_dep_available("PyMuPDF"):
-    import fitz
+if is_dep_available("pypdfium2"):
+    import pypdfium2 as pdfium


 class FormulaRecResult(BaseCVResult):
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn

     def _to_str(self, *args, **kwargs):
         data = copy.deepcopy(self)
@@ -126,6 +118,7 @@ def get_align_equation(equation: str) -> str:
     """
     is_align = False
     equation = str(equation) + "\n"
+
     begin_dict = [
         r"begin{align}",
         r"begin{align*}",
@@ -147,6 +140,17 @@
     return equation


+def add_text_for_zh_formula(formula: str) -> str:
+    pattern = re.compile(r"([^\x00-\x7F]+)")
+
+    def replacer(match):
+        return f"\\text{{{match.group(1)}}}"
+
+    replaced_formula = pattern.sub(replacer, formula)
+
+    return replaced_formula
+
+
 def generate_tex_file(tex_file_path: str, equation: str) -> None:
     """
     Generates a LaTeX file containing a specific equation.
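
`add_text_for_zh_formula` is the rendering-side counterpart of the decoder change further up: before a formula is written into the LaTeX source, any run of non-ASCII characters (typically Chinese) is wrapped back into `\text{...}` so the xeCJK/xelatex toolchain introduced in the hunks below can typeset it. A quick usage sketch (the sample input is illustrative):

```python
import re


def add_text_for_zh_formula(formula: str) -> str:
    # Same behaviour as the helper above: wrap non-ASCII runs in \text{...}.
    pattern = re.compile(r"([^\x00-\x7F]+)")
    return pattern.sub(lambda m: f"\\text{{{m.group(1)}}}", formula)


print(add_text_for_zh_formula(r"面积 S = \pi r^2"))
# -> \text{面积} S = \pi r^2
```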
@@ -161,17 +165,19 @@ def generate_tex_file(tex_file_path: str, equation: str) -> None:
     """
     with custom_open(tex_file_path, "w") as fp:
         start_template = (
-            r"\documentclass{article}" + "\n"
+            r"\documentclass[varwidth]{standalone}" + "\n"
             r"\usepackage{cite}" + "\n"
             r"\usepackage{amsmath,amssymb,amsfonts,upgreek}" + "\n"
             r"\usepackage{graphicx}" + "\n"
             r"\usepackage{textcomp}" + "\n"
+            r"\usepackage{xeCJK}" + "\n"
             r"\DeclareMathSizes{14}{14}{9.8}{7}" + "\n"
             r"\pagestyle{empty}" + "\n"
             r"\begin{document}" + "\n"
             r"\begin{large}" + "\n"
         )
         fp.write(start_template)
+        equation = add_text_for_zh_formula(equation)
         equation = get_align_equation(equation)
         fp.write(equation)
         end_template = r"\end{large}" + "\n" r"\end{document}" + "\n"
@@ -197,7 +203,7 @@ def generate_pdf_file(
         and None if an error occurred during the pdflatex execution.
     """
     if os.path.exists(tex_path):
-        command = "pdflatex -interaction=nonstopmode -halt-on-error -output-directory={} {}".format(
+        command = "xelatex -interaction=nonstopmode -halt-on-error -output-directory={} {}".format(
             pdf_dir, tex_path
         )
         if is_debug:
@@ -236,7 +242,7 @@ def crop_white_area(image: np.ndarray) -> Optional[List[int]]:
         return None


-@function_requires_deps("PyMuPDF", "opencv-contrib-python")
+@function_requires_deps("pypdfium2", "opencv-contrib-python")
 def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
     """
     Converts a single-page PDF to an image, optionally cropping white areas and adding padding.
@@ -249,21 +255,16 @@ def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
     Returns:
         np.ndarray: The resulting image as a NumPy array, or None if the PDF is not single-page.
     """
-
-    pdfDoc = fitz.open(pdf_path)
-    if pdfDoc.page_count != 1:
+    pdfDoc = pdfium.PdfDocument(pdf_path)
+    if len(pdfDoc) != 1:
         return None
-    for pg in range(pdfDoc.page_count):
-        page = pdfDoc[pg]
+    for page in pdfDoc:
         rotate = int(0)
-        zoom_x = 2
-        zoom_y = 2
-        mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
-        pix = page.get_pixmap(matrix=mat, alpha=False)
-        getpngdata = pix.tobytes(output="png")
-        # decode as np.uint8
-        image_array = np.frombuffer(getpngdata, dtype=np.uint8)
-        img = cv2.imdecode(image_array, cv2.IMREAD_ANYCOLOR)
+        zoom = 2
+        img = page.render(scale=zoom, rotation=rotate).to_pil()
+        img = img.convert("RGB")
+        img = np.array(img)
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
         xywh = crop_white_area(img)

         if xywh is not None:
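
`pdf2img` above now renders the single-page PDF with pypdfium2 instead of PyMuPDF. The same rendering call in isolation, assuming the pypdfium2 v4-style API used in the hunk (`PdfDocument`, `PdfPage.render(scale=...)`, `.to_pil()`):

```python
import numpy as np
import pypdfium2 as pdfium


def first_page_to_bgr(pdf_path: str, scale: float = 2.0) -> np.ndarray:
    """Render page 0 of a PDF to a BGR ndarray (OpenCV layout), mirroring pdf2img above."""
    page = pdfium.PdfDocument(pdf_path)[0]
    pil_img = page.render(scale=scale).to_pil().convert("RGB")
    return np.asarray(pil_img)[:, :, ::-1]  # RGB -> BGR
```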
@@ -23,10 +23,9 @@ class NormalizeFeatures:

     def _normalize(self, preds):
         """normalize"""
-        feas_norm = np.sqrt(np.sum(np.square(preds[0]), axis=0, keepdims=True))
-        features = np.divide(preds[0], feas_norm)
+        feas_norm = np.sqrt(np.sum(np.square(preds), axis=1, keepdims=True))
+        features = np.divide(preds, feas_norm)
         return features

     def __call__(self, preds):
-        normalized_features = [self._normalize(feature) for feature in preds]
-        return normalized_features
+        return self._normalize(preds[0])
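
The updated `NormalizeFeatures` L2-normalizes each row of the `(batch, dim)` feature matrix in one shot (`axis=1`) instead of normalizing a single vector along `axis=0` per sample. A small numeric check of the same computation:

```python
import numpy as np


def l2_normalize_rows(feats: np.ndarray) -> np.ndarray:
    # One norm per sample (axis=1), applied to the whole batch, as in the new _normalize.
    norms = np.sqrt(np.sum(np.square(feats), axis=1, keepdims=True))
    return feats / norms


batch = np.array([[3.0, 4.0], [0.0, 2.0]])
print(l2_normalize_rows(batch))
# [[0.6 0.8]
#  [0.  1. ]]
```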
@@ -26,6 +26,9 @@ from .result import KptResult


 class KptBatchSampler(ImageBatchSampler):
+    # don't support to pass pdf file as input
+    PDF_SUFFIX = []
+
     def sample(self, inputs):
         if not isinstance(inputs, list):
             inputs = [inputs]
@@ -316,6 +316,8 @@ class DetPredictor(BasePredictor):
             "BlazeFace",
             "BlazeFace-FPN-SSH",
             "PP-DocLayout-L",
+            "PP-DocLayout_plus-L",
+            "PP-DocBlockLayout",
         ]
         if any(name in self.model_name for name in models_required_imgsize):
             ordered_required_keys = (