paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233):
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +11 -59
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  44. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  45. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  46. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  48. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  49. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  51. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  52. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  53. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  54. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  57. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  58. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  59. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  60. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  61. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  63. paddlex/inference/models/formula_recognition/predictor.py +7 -1
  64. paddlex/inference/models/formula_recognition/processors.py +92 -79
  65. paddlex/inference/models/formula_recognition/result.py +28 -27
  66. paddlex/inference/models/image_feature/processors.py +3 -4
  67. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  68. paddlex/inference/models/object_detection/predictor.py +2 -0
  69. paddlex/inference/models/object_detection/processors.py +28 -3
  70. paddlex/inference/models/object_detection/utils.py +2 -0
  71. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  72. paddlex/inference/models/text_detection/predictor.py +8 -0
  73. paddlex/inference/models/text_detection/processors.py +44 -10
  74. paddlex/inference/models/text_detection/result.py +0 -10
  75. paddlex/inference/pipelines/__init__.py +9 -5
  76. paddlex/inference/pipelines/_parallel.py +172 -0
  77. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  78. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  79. paddlex/inference/pipelines/base.py +14 -4
  80. paddlex/inference/pipelines/components/faisser.py +1 -1
  81. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  82. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  83. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  84. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  85. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  86. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  87. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  88. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  89. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
  90. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  91. paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
  92. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  93. paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
  94. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  95. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
  96. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
  97. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  98. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  99. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  100. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  101. paddlex/inference/pipelines/ocr/result.py +19 -16
  102. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  103. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  104. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  105. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  106. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
  107. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  108. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  109. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  110. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  112. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  113. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  114. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  115. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  116. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  117. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  118. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  119. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  120. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  121. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  122. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  123. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  127. paddlex/inference/serving/infra/utils.py +20 -22
  128. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  129. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  130. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  131. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  132. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  133. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  134. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  135. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  136. paddlex/inference/utils/hpi.py +8 -1
  137. paddlex/inference/utils/hpi_model_info_collection.json +81 -2
  138. paddlex/inference/utils/io/readers.py +12 -12
  139. paddlex/inference/utils/mkldnn_blocklist.py +25 -0
  140. paddlex/inference/utils/official_models.py +14 -0
  141. paddlex/inference/utils/pp_option.py +29 -8
  142. paddlex/model.py +2 -2
  143. paddlex/modules/__init__.py +1 -1
  144. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  145. paddlex/modules/base/__init__.py +1 -1
  146. paddlex/modules/base/evaluator.py +5 -5
  147. paddlex/modules/base/trainer.py +1 -1
  148. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  149. paddlex/modules/doc_vlm/evaluator.py +2 -2
  150. paddlex/modules/doc_vlm/exportor.py +2 -2
  151. paddlex/modules/doc_vlm/model_list.py +1 -1
  152. paddlex/modules/doc_vlm/trainer.py +2 -2
  153. paddlex/modules/face_recognition/evaluator.py +2 -2
  154. paddlex/modules/formula_recognition/evaluator.py +5 -2
  155. paddlex/modules/formula_recognition/model_list.py +3 -0
  156. paddlex/modules/formula_recognition/trainer.py +3 -0
  157. paddlex/modules/general_recognition/evaluator.py +1 -1
  158. paddlex/modules/image_classification/evaluator.py +2 -2
  159. paddlex/modules/image_classification/model_list.py +1 -0
  160. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  161. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  162. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  163. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  164. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  165. paddlex/modules/object_detection/evaluator.py +2 -2
  166. paddlex/modules/object_detection/model_list.py +2 -0
  167. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  168. paddlex/modules/table_recognition/evaluator.py +2 -2
  169. paddlex/modules/text_detection/evaluator.py +2 -2
  170. paddlex/modules/text_detection/model_list.py +2 -0
  171. paddlex/modules/text_recognition/evaluator.py +2 -2
  172. paddlex/modules/text_recognition/model_list.py +2 -0
  173. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  174. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  175. paddlex/modules/ts_classification/evaluator.py +2 -2
  176. paddlex/modules/ts_forecast/evaluator.py +2 -2
  177. paddlex/modules/video_classification/evaluator.py +2 -2
  178. paddlex/modules/video_detection/evaluator.py +2 -2
  179. paddlex/ops/__init__.py +2 -2
  180. paddlex/paddlex_cli.py +19 -13
  181. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  182. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  183. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  184. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  185. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  186. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  187. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  188. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  189. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  190. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  191. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  192. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  193. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  194. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  195. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  196. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  197. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  198. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  200. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  201. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  202. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  203. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  204. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  205. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  206. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  207. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  208. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  209. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  210. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  211. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  212. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  213. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  214. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  215. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  216. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  217. paddlex/repo_apis/base/config.py +1 -1
  218. paddlex/repo_manager/core.py +3 -3
  219. paddlex/repo_manager/meta.py +6 -2
  220. paddlex/repo_manager/repo.py +17 -16
  221. paddlex/utils/custom_device_list.py +26 -2
  222. paddlex/utils/deps.py +1 -1
  223. paddlex/utils/device.py +15 -8
  224. paddlex/utils/env.py +4 -0
  225. paddlex/utils/flags.py +2 -4
  226. paddlex/utils/fonts/__init__.py +34 -4
  227. paddlex/utils/misc.py +1 -1
  228. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
  229. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
  230. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
  231. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
  232. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
  233. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,6 @@
15
15
  import math
16
16
  import os
17
17
  from dataclasses import dataclass
18
- from functools import partial
19
18
  from typing import Any, Dict, List, Optional, Tuple, Union
20
19
 
21
20
  import paddle
@@ -1983,74 +1982,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
1983
1982
  def get_decoder(self):
1984
1983
  return self.model
1985
1984
 
1986
- @classmethod
1987
- def _get_tensor_parallel_mappings(cls, config: Qwen2VLConfig, is_split=True):
1988
-
1989
- logging.info("Qwen2 inference model _get_tensor_parallel_mappings")
1990
-
1991
- from paddlenlp.transformers.conversion_utils import split_or_merge_func
1992
-
1993
- fn = split_or_merge_func(
1994
- is_split=is_split,
1995
- tensor_parallel_degree=config.tensor_parallel_degree,
1996
- tensor_parallel_rank=config.tensor_parallel_rank,
1997
- num_attention_heads=config.num_attention_heads,
1998
- )
1999
-
2000
- def get_tensor_parallel_split_mappings(num_layers):
2001
- final_actions = {}
2002
-
2003
- base_actions = {
2004
- "lm_head.weight": partial(fn, is_column=True),
2005
- # Row Linear
2006
- "embed_tokens.weight": partial(fn, is_column=False),
2007
- "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
2008
- "layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
2009
- }
2010
-
2011
- base_actions["layers.0.self_attn.q_proj.weight"] = partial(
2012
- fn, is_column=True
2013
- )
2014
- base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True)
2015
- # if we have enough num_key_value_heads to split, then split it.
2016
- if config.num_key_value_heads % config.tensor_parallel_degree == 0:
2017
- base_actions["layers.0.self_attn.k_proj.weight"] = partial(
2018
- fn, is_column=True
2019
- )
2020
- base_actions["layers.0.self_attn.v_proj.weight"] = partial(
2021
- fn, is_column=True
2022
- )
2023
- base_actions["layers.0.self_attn.k_proj.bias"] = partial(
2024
- fn, is_column=True
2025
- )
2026
- base_actions["layers.0.self_attn.v_proj.bias"] = partial(
2027
- fn, is_column=True
2028
- )
2029
-
2030
- if config.fuse_attention_ffn:
2031
- base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial(
2032
- fn, is_column=True, is_naive_2fuse=True
2033
- )
2034
- else:
2035
- base_actions["layers.0.mlp.gate_proj.weight"] = partial(
2036
- fn, is_column=True
2037
- )
2038
- base_actions["layers.0.mlp.up_proj.weight"] = partial(
2039
- fn, is_column=True
2040
- )
2041
-
2042
- for key, action in base_actions.items():
2043
- if "layers.0." in key:
2044
- for i in range(num_layers):
2045
- final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
2046
- final_actions[key] = action
2047
-
2048
- return final_actions
2049
-
2050
- mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
2051
-
2052
- return mappings
2053
-
2054
1985
  @staticmethod
2055
1986
  def get_rope_index(
2056
1987
  spatial_merge_size,
@@ -2276,42 +2207,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
2276
2207
 
2277
2208
  return model_kwargs
2278
2209
 
2279
- def vision_forward(
2280
- self,
2281
- input_ids: paddle.Tensor,
2282
- inputs_embeds: Optional[paddle.Tensor] = None,
2283
- attention_mask: Optional[paddle.Tensor] = None,
2284
- position_ids: Optional[paddle.Tensor] = None,
2285
- pixel_values: Optional[paddle.Tensor] = None,
2286
- pixel_values_videos: Optional[paddle.Tensor] = None,
2287
- image_grid_thw: Optional[paddle.Tensor] = None,
2288
- video_grid_thw: Optional[paddle.Tensor] = None,
2289
- rope_deltas: Optional[paddle.Tensor] = None,
2290
- ):
2291
-
2292
- if inputs_embeds is None:
2293
- from paddlenlp.experimental.transformers.qwen2.modeling import (
2294
- Qwen2VLForConditionalGenerationBlockInferenceModel,
2295
- )
2296
-
2297
- assert isinstance(
2298
- self.model, Qwen2VLForConditionalGenerationBlockInferenceModel
2299
- ), "model is not an instance of Qwen2VLForConditionalGenerationBlockInferenceModel"
2300
-
2301
- inputs_embeds = self.model.qwen2.embed_tokens(input_ids)
2302
- if pixel_values is not None:
2303
- pixel_values = paddle.cast(pixel_values, paddle.bfloat16)
2304
- image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
2305
- image_mask = input_ids == self.config.image_token_id
2306
-
2307
- inputs_embeds[image_mask] = image_embeds
2308
- if pixel_values_videos is not None:
2309
- pixel_values_videos = paddle.cast(pixel_values_videos, paddle.bfloat16)
2310
- video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
2311
- video_mask = input_ids == self.config.video_token_id
2312
- inputs_embeds[video_mask] = video_embeds
2313
- return inputs_embeds
2314
-
2315
2210
  def forward(
2316
2211
  self,
2317
2212
  input_ids: paddle.Tensor = None,
@@ -14,6 +14,7 @@
14
14
 
15
15
  import copy
16
16
  import os
17
+ import warnings
17
18
  from typing import List
18
19
 
19
20
  from ....modules.doc_vlm.model_list import MODELS
@@ -27,6 +28,11 @@ from .result import DocVLMResult
27
28
  class DocVLMPredictor(BasePredictor):
28
29
 
29
30
  entities = MODELS
31
+ model_group = {
32
+ "PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"},
33
+ "PP-DocBee2": {"PP-DocBee2-3B"},
34
+ "PP-Chart2Table": {"PP-Chart2Table"},
35
+ }
30
36
 
31
37
  def __init__(self, *args, **kwargs):
32
38
  """Initializes DocVLMPredictor.
@@ -34,8 +40,17 @@ class DocVLMPredictor(BasePredictor):
34
40
  *args: Arbitrary positional arguments passed to the superclass.
35
41
  **kwargs: Arbitrary keyword arguments passed to the superclass.
36
42
  """
43
+ import paddle
44
+
37
45
  super().__init__(*args, **kwargs)
38
46
  self.device = kwargs.get("device", None)
47
+ self.dtype = (
48
+ "bfloat16"
49
+ if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
50
+ and (self.device is None or "cpu" not in self.device)
51
+ else "float32"
52
+ )
53
+
39
54
  self.infer, self.processor = self._build(**kwargs)
40
55
 
41
56
  def _build_batch_sampler(self):
@@ -44,7 +59,7 @@ class DocVLMPredictor(BasePredictor):
44
59
  Returns:
45
60
  DocVLMBatchSampler: An instance of DocVLMBatchSampler.
46
61
  """
47
- return DocVLMBatchSampler()
62
+ return DocVLMBatchSampler(self.model_name)
48
63
 
49
64
  def _get_result_class(self):
50
65
  """Returns the result class, DocVLMResult.
@@ -61,28 +76,49 @@ class DocVLMPredictor(BasePredictor):
61
76
  model: An instance of Paddle model, could be either a dynamic model or a static model.
62
77
  processor: The correspounding processor for the model.
63
78
  """
64
- import paddle
79
+ from .modeling import (
80
+ PPChart2TableInference,
81
+ PPDocBee2Inference,
82
+ PPDocBeeInference,
83
+ )
65
84
 
66
- from .modeling import PPDocBeeInference
85
+ # build processor
86
+ processor = self.build_processor()
67
87
 
68
88
  # build model
69
- if "PP-DocBee" in self.model_name:
89
+ if self.model_name in self.model_group["PP-DocBee"]:
70
90
  if kwargs.get("use_hpip", False):
71
- raise ValueError(
72
- f"PP-DocBee series do not support `use_hpip=True` for now."
91
+ warnings.warn(
92
+ "The PP-DocBee series does not support `use_hpip=True` for now."
93
+ )
94
+ with TemporaryDeviceChanger(self.device):
95
+ model = PPDocBeeInference.from_pretrained(
96
+ self.model_dir, dtype=self.dtype
97
+ )
98
+ elif self.model_name in self.model_group["PP-Chart2Table"]:
99
+ if kwargs.get("use_hpip", False):
100
+ warnings.warn(
101
+ "The PP-Chart2Table series does not support `use_hpip=True` for now."
73
102
  )
74
- dtype = (
75
- "bfloat16"
76
- if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
77
- else "float32"
78
- )
79
103
  with TemporaryDeviceChanger(self.device):
80
- model = PPDocBeeInference.from_pretrained(self.model_dir, dtype=dtype)
104
+ model = PPChart2TableInference.from_pretrained(
105
+ self.model_dir,
106
+ dtype=self.dtype,
107
+ pad_token_id=processor.tokenizer.eos_token_id,
108
+ )
109
+ elif self.model_name in self.model_group["PP-DocBee2"]:
110
+ if kwargs.get("use_hpip", False):
111
+ warnings.warn(
112
+ "The PP-Chart2Table series does not support `use_hpip=True` for now."
113
+ )
114
+ with TemporaryDeviceChanger(self.device):
115
+ model = PPDocBee2Inference.from_pretrained(
116
+ self.model_dir,
117
+ dtype=self.dtype,
118
+ )
81
119
  else:
82
120
  raise NotImplementedError(f"Model {self.model_name} is not supported.")
83
121
 
84
- # build processor
85
- processor = self.build_processor()
86
122
  return model, processor
87
123
 
88
124
  def process(self, data: List[dict], **kwargs):
@@ -96,15 +132,11 @@ class DocVLMPredictor(BasePredictor):
96
132
  Returns:
97
133
  dict: A dictionary containing the raw sample information and prediction results for every instance of the batch.
98
134
  """
99
- assert (
100
- isinstance(data, List) and len(data) == 1
101
- ), "data must be a list of length 1"
102
- assert isinstance(data[0], dict)
135
+ assert all(isinstance(i, dict) for i in data)
103
136
 
104
- data = data[0]
105
137
  src_data = copy.copy(data)
106
138
  # preprocess
107
- data = self.processor.preprocess(**data)
139
+ data = self.processor.preprocess(data)
108
140
  data = self._switch_inputs_to_device(data)
109
141
 
110
142
  # do infer
@@ -118,15 +150,38 @@ class DocVLMPredictor(BasePredictor):
118
150
  return result_dict
119
151
 
120
152
  def build_processor(self, **kwargs):
121
- from ..common.tokenizer import MIXQwen2Tokenizer
122
- from .processors import PPDocBeeProcessor, Qwen2VLImageProcessor
123
-
124
- if "PP-DocBee" in self.model_name:
153
+ from ..common.tokenizer import (
154
+ MIXQwen2_5_Tokenizer,
155
+ MIXQwen2Tokenizer,
156
+ QWenTokenizer,
157
+ )
158
+ from .processors import (
159
+ GOTImageProcessor,
160
+ PPChart2TableProcessor,
161
+ PPDocBee2Processor,
162
+ PPDocBeeProcessor,
163
+ Qwen2_5_VLImageProcessor,
164
+ Qwen2VLImageProcessor,
165
+ )
166
+
167
+ if self.model_name in self.model_group["PP-DocBee"]:
125
168
  image_processor = Qwen2VLImageProcessor()
126
169
  tokenizer = MIXQwen2Tokenizer.from_pretrained(self.model_dir)
127
170
  return PPDocBeeProcessor(
128
171
  image_processor=image_processor, tokenizer=tokenizer
129
172
  )
173
+ elif self.model_name in self.model_group["PP-Chart2Table"]:
174
+ image_processor = GOTImageProcessor(1024)
175
+ tokenizer = QWenTokenizer.from_pretrained(self.model_dir)
176
+ return PPChart2TableProcessor(
177
+ image_processor=image_processor, tokenizer=tokenizer, dtype=self.dtype
178
+ )
179
+ elif self.model_name in self.model_group["PP-DocBee2"]:
180
+ image_processor = Qwen2_5_VLImageProcessor()
181
+ tokenizer = MIXQwen2_5_Tokenizer.from_pretrained(self.model_dir)
182
+ return PPDocBee2Processor(
183
+ image_processor=image_processor, tokenizer=tokenizer
184
+ )
130
185
  else:
131
186
  raise NotImplementedError
132
187
 
@@ -0,0 +1,97 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Dict, List, Union
16
+
17
+ import numpy as np
18
+ import paddle
19
+ import requests
20
+ from paddle.vision import transforms
21
+ from PIL import Image
22
+
23
+ from ....utils.benchmark import benchmark
24
+
25
+ MEAN = (0.48145466, 0.4578275, 0.40821073)
26
+ STD = (0.26862954, 0.26130258, 0.27577711)
27
+
28
+
29
+ class GOTImageProcessor(object):
30
+ def __init__(self, image_size=1024):
31
+
32
+ self.transform = transforms.Compose(
33
+ [
34
+ transforms.Resize((image_size, image_size), interpolation="bicubic"),
35
+ transforms.ToTensor(),
36
+ transforms.Normalize(MEAN, STD),
37
+ ]
38
+ )
39
+
40
+ def __call__(self, image):
41
+ return self.transform(image)
42
+
43
+
44
+ class PPChart2TableProcessor(object):
45
+ def __init__(self, image_processor, tokenizer, dtype, **kwargs):
46
+ self.image_processor = image_processor
47
+ self.tokenizer = tokenizer
48
+ self.dtype = dtype
49
+
50
+ prompt = (
51
+ "<|im_start|>system\n"
52
+ "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
53
+ "<img>" + "<imgpad>" * 256 + "</img>\n"
54
+ "Chart to table<|im_end|><|im_start|>assistant\n"
55
+ )
56
+ self.input_ids = paddle.to_tensor(self.tokenizer([prompt]).input_ids)
57
+
58
+ @benchmark.timeit
59
+ def preprocess(self, image: Union[str, Image.Image, np.ndarray, Dict, List]):
60
+ if isinstance(image, (str, Image.Image, np.ndarray)):
61
+ image = [image]
62
+ elif isinstance(image, dict):
63
+ image = [image["image"]]
64
+
65
+ assert isinstance(image, list)
66
+ images = [
67
+ image_["image"] if isinstance(image_, dict) else image_ for image_ in image
68
+ ]
69
+ images = [
70
+ self.image_processor(self._load_image(image)).unsqueeze(0).to(self.dtype)
71
+ for image in images
72
+ ]
73
+ img_cnt = len(images)
74
+
75
+ input_ids = paddle.tile(self.input_ids, [img_cnt, 1])
76
+
77
+ return {"input_ids": input_ids, "images": images}
78
+
79
+ @benchmark.timeit
80
+ def postprocess(self, model_pred, *args, **kwargs):
81
+ return self.tokenizer.batch_decode(
82
+ model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
83
+ )
84
+
85
+ def _load_image(self, image_file):
86
+ from io import BytesIO
87
+
88
+ if isinstance(image_file, Image.Image):
89
+ image = image_file.convert("RGB")
90
+ elif isinstance(image_file, np.ndarray):
91
+ image = Image.fromarray(image_file)
92
+ elif image_file.startswith("http") or image_file.startswith("https"):
93
+ response = requests.get(image_file)
94
+ image = Image.open(BytesIO(response.content)).convert("RGB")
95
+ else:
96
+ image = Image.open(image_file).convert("RGB")
97
+ return image
@@ -12,4 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from .GOT_ocr_2_0 import GOTImageProcessor, PPChart2TableProcessor
16
+ from .qwen2_5_vl import PPDocBee2Processor, Qwen2_5_VLImageProcessor
15
17
  from .qwen2_vl import PPDocBeeProcessor, Qwen2VLImageProcessor
@@ -12,13 +12,18 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import base64
16
+ import math
15
17
  from collections import UserDict
18
+ from io import BytesIO
16
19
  from typing import Any, Dict, List, Optional, Tuple, Union
17
20
 
18
21
  import numpy as np
19
22
  import paddle
20
23
  import PIL.Image
24
+ import requests
21
25
  from packaging import version
26
+ from PIL import Image
22
27
 
23
28
  from ...common.tokenizer.tokenizer_utils_base import ExplicitEnum
24
29
 
@@ -370,3 +375,187 @@ class BatchFeature(UserDict):
370
375
  )
371
376
 
372
377
  return self
378
+
379
+
380
class PaddingStrategy(ExplicitEnum):
    """
    Enumerates the values accepted by the `padding` argument of
    [`PretrainedTokenizerBase.__call__`]. Handy for tab-completion in an IDE.
    """

    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"
389
+
390
+
391
def extract_vision_info(
    conversations: Union[List[dict], List[List[dict]]]
) -> List[dict]:
    """Collect every image-bearing content entry from one or more conversations.

    Args:
        conversations: Either a single conversation (a list of message
            dicts) or a list of such conversations. Each message must have a
            ``"content"`` key; only list-valued contents are inspected.

    Returns:
        The content entries, in encounter order, that contain an ``"image"``
        or ``"image_url"`` key or whose ``"type"`` is ``"image"`` /
        ``"image_url"``. Empty input yields an empty list.
    """
    if not conversations:
        return []

    # Normalise a single conversation to the batched form.
    if isinstance(conversations[0], dict):
        conversations = [conversations]

    vision_infos = []
    for conversation in conversations:
        for message in conversation:
            content = message["content"]
            if not isinstance(content, list):
                continue
            for ele in content:
                has_image_key = "image" in ele or "image_url" in ele
                # Use .get(): entries such as {"text": "..."} carry no
                # "type" key and must be skipped rather than raise KeyError.
                has_image_type = isinstance(ele, dict) and ele.get("type") in (
                    "image",
                    "image_url",
                )
                if has_image_key or has_image_type:
                    vision_infos.append(ele)
    return vision_infos
408
+
409
+
410
def process_vision_info(
    conversations: Union[List[dict], List[List[dict]]],
    size_factor: int = 28,
    min_pixels: int = 4 * 28 * 28,
    max_pixels: int = 16384 * 28 * 28,
    max_ratio: float = 200,
) -> Optional[List[Image.Image]]:
    """Load and resize every image referenced by the given conversations.

    Args:
        conversations: A single conversation or a list of conversations, as
            accepted by ``extract_vision_info``.
        size_factor: Forwarded to ``fetch_image``; both output dimensions
            are made divisible by it.
        min_pixels: Forwarded to ``fetch_image`` as the lower pixel bound.
        max_pixels: Forwarded to ``fetch_image`` as the upper pixel bound.
        max_ratio: Forwarded to ``fetch_image`` as the largest allowed
            aspect ratio.

    Returns:
        A list of loaded images, or ``None`` when the conversations contain
        no image entries.

    Raises:
        ValueError: If an entry is tagged as an image by its ``"type"`` but
            has neither an ``"image"`` nor an ``"image_url"`` key.

    Note:
        The defaults are intended to mirror the constants of the upstream
        Qwen2-VL vision utilities (28, 4*28*28, 16384*28*28, 200); callers
        with a different patch size should pass their own values.
    """
    vision_infos = extract_vision_info(conversations)
    image_inputs = []
    for vision_info in vision_infos:
        if "image" in vision_info or "image_url" in vision_info:
            # fetch_image has no defaults for its resize parameters, so they
            # must be forwarded explicitly.
            image_inputs.append(
                fetch_image(
                    vision_info,
                    size_factor=size_factor,
                    min_pixels=min_pixels,
                    max_pixels=max_pixels,
                    max_ratio=max_ratio,
                )
            )
        else:
            raise ValueError("image, image_url should in content.")
    if not image_inputs:
        return None
    return image_inputs
425
+
426
+
427
def fetch_image(
    ele: Dict[str, Union[str, Image.Image]],
    size_factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: float,
) -> Image.Image:
    """Load one image described by a content entry and resize it.

    Args:
        ele: A dict holding the image under ``"image"`` (preferred) or
            ``"image_url"``. A non-dict value is treated as the image
            itself. The image may be a ``PIL.Image.Image``, a
            ``numpy.ndarray``, an ``http(s)://`` URL, a ``file://`` URL, a
            base64 ``data:image`` URI, or a local path. Optional keys
            ``"resized_height"`` / ``"resized_width"`` request a specific
            size; otherwise optional ``"min_pixels"`` / ``"max_pixels"``
            override the corresponding arguments.
        size_factor: Both output dimensions are made divisible by this.
        min_pixels: Lower bound on the output pixel count.
        max_pixels: Upper bound on the output pixel count.
        max_ratio: Largest allowed long-side / short-side ratio.

    Returns:
        PIL.Image.Image: The RGB image resized to the dimensions computed by
        ``smart_resize``.

    Raises:
        ValueError: If the image input is of an unsupported kind (including
            a ``data:image`` URI that is not base64 encoded), or if
            ``smart_resize`` rejects the aspect ratio.
        requests.HTTPError: If an HTTP(S) URL answers with an error status.
    """
    if not isinstance(ele, dict):
        ele = {"image": ele}
    image = ele["image"] if "image" in ele else ele["image_url"]

    image_obj = None
    if isinstance(image, Image.Image):
        image_obj = image
    elif isinstance(image, np.ndarray):
        image_obj = Image.fromarray(image)
    elif isinstance(image, str):
        if image.startswith(("http://", "https://")):
            response = requests.get(image)
            response.raise_for_status()
            # Use the decoded body: the raw stream is not content-decoded,
            # so compressed responses would not parse as an image.
            image_obj = Image.open(BytesIO(response.content))
        elif image.startswith("file://"):
            image_obj = Image.open(image[7:])
        elif image.startswith("data:image"):
            _, _, payload = image.partition(";")
            # Only base64 payloads are supported; anything else falls
            # through to the ValueError below.
            if payload.startswith("base64,"):
                image_obj = Image.open(BytesIO(base64.b64decode(payload[7:])))
        else:
            image_obj = Image.open(image)

    if image_obj is None:
        raise ValueError(
            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
        )
    image = image_obj.convert("RGB")

    # resize
    if "resized_height" in ele and "resized_width" in ele:
        resized_height, resized_width = smart_resize(
            ele["resized_height"],
            ele["resized_width"],
            factor=size_factor,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            max_ratio=max_ratio,
        )
    else:
        width, height = image.size  # Image, not tensor
        min_pixels = ele.get("min_pixels", min_pixels)
        max_pixels = ele.get("max_pixels", max_pixels)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=size_factor,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            max_ratio=max_ratio,
        )
    image = image.resize((resized_width, resized_height))

    return image
486
+
487
+
488
def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of 'factor' nearest to 'number'.

    The quotient is rounded with the built-in ``round``, so exact halves go
    to the even multiple.
    """
    multiples = round(number / factor)
    return multiples * factor
491
+
492
+
493
def ceil_by_factor(number: int, factor: int) -> int:
    """Return the smallest multiple of 'factor' that is not below 'number'."""
    multiples = math.ceil(number / factor)
    return multiples * factor
496
+
497
+
498
def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of 'factor' that does not exceed 'number'."""
    multiples = math.floor(number / factor)
    return multiples * factor
501
+
502
+
503
def smart_resize(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: float,
) -> Tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    Neither returned dimension is ever smaller than 'factor'. For very
    elongated images with a small 'max_pixels' this floor takes precedence
    over condition 2.

    Args:
        height: Source height in pixels.
        width: Source width in pixels.
        factor: Required divisor of both output dimensions.
        min_pixels: Lower bound on ``h_bar * w_bar``.
        max_pixels: Upper bound on ``h_bar * w_bar``.
        max_ratio: Largest accepted long-side / short-side ratio.

    Returns:
        ``(h_bar, w_bar)``: the target height and width.

    Raises:
        ValueError: If the aspect ratio exceeds 'max_ratio'.
    """
    aspect_ratio = max(height, width) / min(height, width)
    if aspect_ratio > max_ratio:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {aspect_ratio}"
        )
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Flooring can collapse the short side to 0 (e.g. 10x2000 with a
        # small max_pixels); clamp so the result is always a usable size.
        h_bar = max(factor, floor_by_factor(height / beta, factor))
        w_bar = max(factor, floor_by_factor(width / beta, factor))
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar
535
+
536
+
537
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image(s). A nested list is flattened; a single image
            is wrapped in a list.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not one of the accepted
            shapes.
    """
    is_sequence = isinstance(images, (list, tuple))

    # The `len(...) > 0` guards keep empty (nested) sequences from raising
    # IndexError; they fall through to the ValueError below instead.
    if (
        is_sequence
        and len(images) > 0
        and isinstance(images[0], (list, tuple))
        and len(images[0]) > 0
        and is_valid_image(images[0][0])
    ):
        return [img for img_list in images for img in img_list]

    if is_sequence and len(images) > 0 and is_valid_image(images[0]):
        return images

    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")