paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +11 -59
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +7 -1
- paddlex/inference/models/formula_recognition/processors.py +92 -79
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +19 -16
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +8 -1
- paddlex/inference/utils/hpi_model_info_collection.json +81 -2
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/mkldnn_blocklist.py +25 -0
- paddlex/inference/utils/official_models.py +14 -0
- paddlex/inference/utils/pp_option.py +29 -8
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +2 -2
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +1 -1
- paddlex/utils/device.py +15 -8
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +2 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py:

```diff
@@ -15,7 +15,6 @@
 import math
 import os
 from dataclasses import dataclass
-from functools import partial
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import paddle
@@ -1983,74 +1982,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @classmethod
-    def _get_tensor_parallel_mappings(cls, config: Qwen2VLConfig, is_split=True):
-
-        logging.info("Qwen2 inference model _get_tensor_parallel_mappings")
-
-        from paddlenlp.transformers.conversion_utils import split_or_merge_func
-
-        fn = split_or_merge_func(
-            is_split=is_split,
-            tensor_parallel_degree=config.tensor_parallel_degree,
-            tensor_parallel_rank=config.tensor_parallel_rank,
-            num_attention_heads=config.num_attention_heads,
-        )
-
-        def get_tensor_parallel_split_mappings(num_layers):
-            final_actions = {}
-
-            base_actions = {
-                "lm_head.weight": partial(fn, is_column=True),
-                # Row Linear
-                "embed_tokens.weight": partial(fn, is_column=False),
-                "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
-                "layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
-            }
-
-            base_actions["layers.0.self_attn.q_proj.weight"] = partial(
-                fn, is_column=True
-            )
-            base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True)
-            # if we have enough num_key_value_heads to split, then split it.
-            if config.num_key_value_heads % config.tensor_parallel_degree == 0:
-                base_actions["layers.0.self_attn.k_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.self_attn.v_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.self_attn.k_proj.bias"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.self_attn.v_proj.bias"] = partial(
-                    fn, is_column=True
-                )
-
-            if config.fuse_attention_ffn:
-                base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial(
-                    fn, is_column=True, is_naive_2fuse=True
-                )
-            else:
-                base_actions["layers.0.mlp.gate_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.mlp.up_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-
-            for key, action in base_actions.items():
-                if "layers.0." in key:
-                    for i in range(num_layers):
-                        final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
-                final_actions[key] = action
-
-            return final_actions
-
-        mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
-
-        return mappings
-
     @staticmethod
     def get_rope_index(
         spatial_merge_size,
```
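The removed `_get_tensor_parallel_mappings` helper generated one split/merge action per transformer layer by expanding `layers.0.*` template keys. A minimal standalone sketch of that expansion, with a stand-in `fn` in place of paddlenlp's `split_or_merge_func` (not reproduced here):

```python
from functools import partial

def fn(weight, *, is_column):
    # Stand-in for the callable returned by paddlenlp's split_or_merge_func;
    # the real one shards or merges a weight tensor across tensor-parallel ranks.
    return ("column" if is_column else "row", weight)

base_actions = {
    "lm_head.weight": partial(fn, is_column=True),
    "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
}

num_layers = 3  # config.num_hidden_layers in the removed code
final_actions = {}
for key, action in base_actions.items():
    if "layers.0." in key:
        for i in range(num_layers):
            final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
    final_actions[key] = action

# One action per layer plus the non-layer keys:
# layers.0/1/2.self_attn.o_proj.weight and lm_head.weight
print(sorted(final_actions))
```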
paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py (continued):

```diff
@@ -2276,42 +2207,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
 
         return model_kwargs
 
-    def vision_forward(
-        self,
-        input_ids: paddle.Tensor,
-        inputs_embeds: Optional[paddle.Tensor] = None,
-        attention_mask: Optional[paddle.Tensor] = None,
-        position_ids: Optional[paddle.Tensor] = None,
-        pixel_values: Optional[paddle.Tensor] = None,
-        pixel_values_videos: Optional[paddle.Tensor] = None,
-        image_grid_thw: Optional[paddle.Tensor] = None,
-        video_grid_thw: Optional[paddle.Tensor] = None,
-        rope_deltas: Optional[paddle.Tensor] = None,
-    ):
-
-        if inputs_embeds is None:
-            from paddlenlp.experimental.transformers.qwen2.modeling import (
-                Qwen2VLForConditionalGenerationBlockInferenceModel,
-            )
-
-            assert isinstance(
-                self.model, Qwen2VLForConditionalGenerationBlockInferenceModel
-            ), "model is not an instance of Qwen2VLForConditionalGenerationBlockInferenceModel"
-
-            inputs_embeds = self.model.qwen2.embed_tokens(input_ids)
-            if pixel_values is not None:
-                pixel_values = paddle.cast(pixel_values, paddle.bfloat16)
-                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
-                image_mask = input_ids == self.config.image_token_id
-
-                inputs_embeds[image_mask] = image_embeds
-            if pixel_values_videos is not None:
-                pixel_values_videos = paddle.cast(pixel_values_videos, paddle.bfloat16)
-                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
-                video_mask = input_ids == self.config.video_token_id
-                inputs_embeds[video_mask] = video_embeds
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: paddle.Tensor = None,
```
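The heart of the removed `vision_forward` is a boolean-mask scatter: embedding rows at image/video placeholder positions are overwritten with the vision tower's output. A minimal NumPy sketch of that substitution (token id 5 is a hypothetical placeholder id):

```python
import numpy as np

hidden = 4
input_ids = np.array([101, 5, 5, 5, 102])   # three placeholder tokens (id 5, assumed)
inputs_embeds = np.zeros((len(input_ids), hidden))
image_embeds = np.ones((3, hidden))         # one row per placeholder, from the vision encoder

image_mask = input_ids == 5                 # cf. input_ids == self.config.image_token_id
inputs_embeds[image_mask] = image_embeds    # matching rows are replaced in sequence order

assert inputs_embeds[0].sum() == 0 and inputs_embeds[1:4].sum() == 3 * hidden
```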
paddlex/inference/models/doc_vlm/predictor.py:

```diff
@@ -14,6 +14,7 @@
 
 import copy
 import os
+import warnings
 from typing import List
 
 from ....modules.doc_vlm.model_list import MODELS
@@ -27,6 +28,11 @@ from .result import DocVLMResult
 class DocVLMPredictor(BasePredictor):
 
     entities = MODELS
+    model_group = {
+        "PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"},
+        "PP-DocBee2": {"PP-DocBee2-3B"},
+        "PP-Chart2Table": {"PP-Chart2Table"},
+    }
 
     def __init__(self, *args, **kwargs):
         """Initializes DocVLMPredictor.
@@ -34,8 +40,17 @@ class DocVLMPredictor(BasePredictor):
             *args: Arbitrary positional arguments passed to the superclass.
             **kwargs: Arbitrary keyword arguments passed to the superclass.
         """
+        import paddle
+
         super().__init__(*args, **kwargs)
         self.device = kwargs.get("device", None)
+        self.dtype = (
+            "bfloat16"
+            if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
+            and (self.device is None or "cpu" not in self.device)
+            else "float32"
+        )
+
         self.infer, self.processor = self._build(**kwargs)
 
     def _build_batch_sampler(self):
```
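The new `self.dtype` expression prefers bfloat16 when the device type advertises it (or is an NPU) and the requested device is not CPU. A sketch of the same rule as a standalone function, assuming a working Paddle install; `device_type` stands in for PaddleX's `get_device_type()` helper:

```python
from typing import Optional

import paddle

def pick_dtype(device: Optional[str], device_type: str) -> str:
    # Mirrors the predictor's rule: bfloat16 on capable hardware, unless the
    # user explicitly asked for CPU; float32 otherwise.
    bf16_capable = "npu" in device_type or paddle.amp.is_bfloat16_supported()
    not_cpu = device is None or "cpu" not in device
    return "bfloat16" if bf16_capable and not_cpu else "float32"

print(pick_dtype(None, paddle.device.get_device()))
```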
paddlex/inference/models/doc_vlm/predictor.py (continued):

```diff
@@ -44,7 +59,7 @@ class DocVLMPredictor(BasePredictor):
         Returns:
             DocVLMBatchSampler: An instance of DocVLMBatchSampler.
         """
-        return DocVLMBatchSampler()
+        return DocVLMBatchSampler(self.model_name)
 
     def _get_result_class(self):
         """Returns the result class, DocVLMResult.
@@ -61,28 +76,49 @@ class DocVLMPredictor(BasePredictor):
             model: An instance of Paddle model, could be either a dynamic model or a static model.
             processor: The correspounding processor for the model.
         """
-        import paddle
+        from .modeling import (
+            PPChart2TableInference,
+            PPDocBee2Inference,
+            PPDocBeeInference,
+        )
 
-        from .modeling import PPDocBeeInference
+        # build processor
+        processor = self.build_processor()
 
         # build model
-        if "PP-DocBee" in self.model_name:
+        if self.model_name in self.model_group["PP-DocBee"]:
             if kwargs.get("use_hpip", False):
-                logging.warning(
-                    "The PP-DocBee series does not support `use_hpip=True` for now."
+                warnings.warn(
+                    "The PP-DocBee series does not support `use_hpip=True` for now."
+                )
+            with TemporaryDeviceChanger(self.device):
+                model = PPDocBeeInference.from_pretrained(
+                    self.model_dir, dtype=self.dtype
+                )
+        elif self.model_name in self.model_group["PP-Chart2Table"]:
+            if kwargs.get("use_hpip", False):
+                warnings.warn(
+                    "The PP-Chart2Table series does not support `use_hpip=True` for now."
                 )
-            dtype = (
-                "bfloat16"
-                if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
-                else "float32"
-            )
             with TemporaryDeviceChanger(self.device):
-                model = PPDocBeeInference.from_pretrained(self.model_dir, dtype=dtype)
+                model = PPChart2TableInference.from_pretrained(
+                    self.model_dir,
+                    dtype=self.dtype,
+                    pad_token_id=processor.tokenizer.eos_token_id,
+                )
+        elif self.model_name in self.model_group["PP-DocBee2"]:
+            if kwargs.get("use_hpip", False):
+                warnings.warn(
+                    "The PP-Chart2Table series does not support `use_hpip=True` for now."
+                )
+            with TemporaryDeviceChanger(self.device):
+                model = PPDocBee2Inference.from_pretrained(
+                    self.model_dir,
+                    dtype=self.dtype,
+                )
         else:
             raise NotImplementedError(f"Model {self.model_name} is not supported.")
 
-        # build processor
-        processor = self.build_processor()
         return model, processor
 
     def process(self, data: List[dict], **kwargs):
```
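`_build` now routes on exact membership in `model_group` instead of substring matching on the model name. A compact sketch of the dispatch; `resolve_group` is hypothetical (the shipped code uses an explicit if/elif chain):

```python
model_group = {
    "PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"},
    "PP-DocBee2": {"PP-DocBee2-3B"},
    "PP-Chart2Table": {"PP-Chart2Table"},
}

def resolve_group(model_name: str) -> str:
    for group, members in model_group.items():
        if model_name in members:
            return group
    raise NotImplementedError(f"Model {model_name} is not supported.")

# Exact-set membership avoids the pitfall of the old substring test, where
# "PP-DocBee" in name would also match the newer "PP-DocBee2-3B".
assert resolve_group("PP-DocBee2-3B") == "PP-DocBee2"
assert resolve_group("PP-DocBee-7B") == "PP-DocBee"
```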
paddlex/inference/models/doc_vlm/predictor.py (continued):

```diff
@@ -96,15 +132,11 @@
         Returns:
             dict: A dictionary containing the raw sample information and prediction results for every instance of the batch.
         """
-        assert (
-            isinstance(data, List) and len(data) == 1
-        ), "data must be a list of length 1"
-        assert isinstance(data[0], dict)
+        assert all(isinstance(i, dict) for i in data)
 
-        data = data[0]
         src_data = copy.copy(data)
         # preprocess
-        data = self.processor.preprocess(**data)
+        data = self.processor.preprocess(data)
         data = self._switch_inputs_to_device(data)
 
         # do infer
@@ -118,15 +150,38 @@
         return result_dict
 
     def build_processor(self, **kwargs):
-        from ..common.tokenizer import MIXQwen2Tokenizer
-        from .processors import PPDocBeeProcessor, Qwen2VLImageProcessor
-
-        if "PP-DocBee" in self.model_name:
+        from ..common.tokenizer import (
+            MIXQwen2_5_Tokenizer,
+            MIXQwen2Tokenizer,
+            QWenTokenizer,
+        )
+        from .processors import (
+            GOTImageProcessor,
+            PPChart2TableProcessor,
+            PPDocBee2Processor,
+            PPDocBeeProcessor,
+            Qwen2_5_VLImageProcessor,
+            Qwen2VLImageProcessor,
+        )
+
+        if self.model_name in self.model_group["PP-DocBee"]:
             image_processor = Qwen2VLImageProcessor()
             tokenizer = MIXQwen2Tokenizer.from_pretrained(self.model_dir)
             return PPDocBeeProcessor(
                 image_processor=image_processor, tokenizer=tokenizer
             )
+        elif self.model_name in self.model_group["PP-Chart2Table"]:
+            image_processor = GOTImageProcessor(1024)
+            tokenizer = QWenTokenizer.from_pretrained(self.model_dir)
+            return PPChart2TableProcessor(
+                image_processor=image_processor, tokenizer=tokenizer, dtype=self.dtype
+            )
+        elif self.model_name in self.model_group["PP-DocBee2"]:
+            image_processor = Qwen2_5_VLImageProcessor()
+            tokenizer = MIXQwen2_5_Tokenizer.from_pretrained(self.model_dir)
+            return PPDocBee2Processor(
+                image_processor=image_processor, tokenizer=tokenizer
+            )
         else:
             raise NotImplementedError
 
```
paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py (new file):

```diff
@@ -0,0 +1,97 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Union
+
+import numpy as np
+import paddle
+import requests
+from paddle.vision import transforms
+from PIL import Image
+
+from ....utils.benchmark import benchmark
+
+MEAN = (0.48145466, 0.4578275, 0.40821073)
+STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+class GOTImageProcessor(object):
+    def __init__(self, image_size=1024):
+
+        self.transform = transforms.Compose(
+            [
+                transforms.Resize((image_size, image_size), interpolation="bicubic"),
+                transforms.ToTensor(),
+                transforms.Normalize(MEAN, STD),
+            ]
+        )
+
+    def __call__(self, image):
+        return self.transform(image)
+
+
+class PPChart2TableProcessor(object):
+    def __init__(self, image_processor, tokenizer, dtype, **kwargs):
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        self.dtype = dtype
+
+        prompt = (
+            "<|im_start|>system\n"
+            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
+            "<img>" + "<imgpad>" * 256 + "</img>\n"
+            "Chart to table<|im_end|><|im_start|>assistant\n"
+        )
+        self.input_ids = paddle.to_tensor(self.tokenizer([prompt]).input_ids)
+
+    @benchmark.timeit
+    def preprocess(self, image: Union[str, Image.Image, np.ndarray, Dict, List]):
+        if isinstance(image, (str, Image.Image, np.ndarray)):
+            image = [image]
+        elif isinstance(image, dict):
+            image = [image["image"]]
+
+        assert isinstance(image, list)
+        images = [
+            image_["image"] if isinstance(image_, dict) else image_ for image_ in image
+        ]
+        images = [
+            self.image_processor(self._load_image(image)).unsqueeze(0).to(self.dtype)
+            for image in images
+        ]
+        img_cnt = len(images)
+
+        input_ids = paddle.tile(self.input_ids, [img_cnt, 1])
+
+        return {"input_ids": input_ids, "images": images}
+
+    @benchmark.timeit
+    def postprocess(self, model_pred, *args, **kwargs):
+        return self.tokenizer.batch_decode(
+            model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    def _load_image(self, image_file):
+        from io import BytesIO
+
+        if isinstance(image_file, Image.Image):
+            image = image_file.convert("RGB")
+        elif isinstance(image_file, np.ndarray):
+            image = Image.fromarray(image_file)
+        elif image_file.startswith("http") or image_file.startswith("https"):
+            response = requests.get(image_file)
+            image = Image.open(BytesIO(response.content)).convert("RGB")
+        else:
+            image = Image.open(image_file).convert("RGB")
+        return image
```
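A hedged usage sketch for the new processor classes above (assumes paddlex 3.0.1 with Paddle installed; the dummy image and expected shape are illustrative):

```python
import numpy as np
from PIL import Image

from paddlex.inference.models.doc_vlm.processors import GOTImageProcessor

processor = GOTImageProcessor(image_size=1024)
dummy = Image.fromarray(np.zeros((600, 800, 3), dtype=np.uint8))
tensor = processor(dummy)  # bicubic resize to 1024x1024, ToTensor, CLIP-style normalization
print(tensor.shape)        # expected: [3, 1024, 1024]
```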
paddlex/inference/models/doc_vlm/processors/__init__.py:

```diff
@@ -12,4 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .GOT_ocr_2_0 import GOTImageProcessor, PPChart2TableProcessor
+from .qwen2_5_vl import PPDocBee2Processor, Qwen2_5_VLImageProcessor
 from .qwen2_vl import PPDocBeeProcessor, Qwen2VLImageProcessor
```
paddlex/inference/models/doc_vlm/processors/common.py:

```diff
@@ -12,13 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import base64
+import math
 from collections import UserDict
+from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import paddle
 import PIL.Image
+import requests
 from packaging import version
+from PIL import Image
 
 from ...common.tokenizer.tokenizer_utils_base import ExplicitEnum
 
@@ -370,3 +375,187 @@ class BatchFeature(UserDict):
         )
 
         return self
+
+
+class PaddingStrategy(ExplicitEnum):
+    """
+    Possible values for the `padding` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in an
+    IDE.
+    """
+
+    LONGEST = "longest"
+    MAX_LENGTH = "max_length"
+    DO_NOT_PAD = "do_not_pad"
+
+
+def extract_vision_info(
+    conversations: Union[List[dict], List[List[dict]]]
+) -> List[dict]:
+    vision_infos = []
+    if isinstance(conversations[0], dict):
+        conversations = [conversations]
+    for conversation in conversations:
+        for message in conversation:
+            if isinstance(message["content"], list):
+                for ele in message["content"]:
+                    if (
+                        "image" in ele
+                        or "image_url" in ele
+                        or ele["type"] in ("image", "image_url")
+                    ):
+                        vision_infos.append(ele)
+    return vision_infos
+
+
+def process_vision_info(
+    conversations: Union[List[dict], List[List[dict]]],
+) -> Tuple[
+    Union[List[Image.Image], None, List[Union[paddle.Tensor, List[Image.Image]]], None]
+]:
+    vision_infos = extract_vision_info(conversations)
+    image_inputs = []
+    for vision_info in vision_infos:
+        if "image" in vision_info or "image_url" in vision_info:
+            image_inputs.append(fetch_image(vision_info))
+        else:
+            raise ValueError("image, image_url should in content.")
+    if len(image_inputs) == 0:
+        image_inputs = None
+    return image_inputs
+
+
+def fetch_image(
+    ele: Dict[str, Union[str, Image.Image]],
+    size_factor: int,
+    min_pixels: int,
+    max_pixels: int,
+    max_ratio: float,
+) -> Image.Image:
+    if not isinstance(ele, dict):
+        ele = {"image": ele}
+    if "image" in ele:
+        image = ele["image"]
+    else:
+        image = ele["image_url"]
+    image_obj = None
+    if isinstance(image, Image.Image):
+        image_obj = image
+    elif isinstance(image, np.ndarray):
+        image_obj = Image.fromarray(image)
+    elif image.startswith("http://") or image.startswith("https://"):
+        image_obj = Image.open(requests.get(image, stream=True).raw)
+    elif image.startswith("file://"):
+        image_obj = Image.open(image[7:])
+    elif image.startswith("data:image"):
+        data = image.split(";", 1)[1]
+        if data.startswith("base64,"):
+            data = base64.b64decode(data[7:])
+        image_obj = Image.open(BytesIO(data))
+    else:
+        image_obj = Image.open(image)
+    if image_obj is None:
+        raise ValueError(
+            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
+        )
+    image = image_obj.convert("RGB")
+    # resize
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            max_ratio=max_ratio,
+        )
+    else:
+        width, height = image.size  # Image, not tensor
+        min_pixels = ele.get("min_pixels", min_pixels)
+        max_pixels = ele.get("max_pixels", max_pixels)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            max_ratio=max_ratio,
+        )
+    image = image.resize((resized_width, resized_height))
+
+    return image
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int,
+    min_pixels: int,
+    max_pixels: int,
+    max_ratio: float,
+) -> Tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > max_ratio:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def make_batched_images(images) -> List[List[ImageInput]]:
+    """
+    Accepts images in list or nested list format, and makes a list of images for preprocessing.
+
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
+            The input image.
+
+    Returns:
+        list: A list of images.
+    """
+    if (
+        isinstance(images, (list, tuple))
+        and isinstance(images[0], (list, tuple))
+        and is_valid_image(images[0][0])
+    ):
+        return [img for img_list in images for img in img_list]
+
+    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
+        return images
+
+    elif is_valid_image(images):
+        return [images]
+
+    raise ValueError(f"Could not make batched images from {images}")
```
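A worked example of the `smart_resize` arithmetic above, with assumed Qwen2-VL-style parameters (factor 28; pixel bounds chosen so neither rescaling branch fires):

```python
def round_by_factor(number: int, factor: int) -> int:
    return round(number / factor) * factor

height, width, factor = 1000, 700, 28
h_bar = max(factor, round_by_factor(height, factor))  # round(35.71) * 28 = 1008
w_bar = max(factor, round_by_factor(width, factor))   # 25 * 28 = 700
assert (h_bar, w_bar) == (1008, 700)
# 1008 * 700 = 705,600 pixels; with, e.g., min_pixels = 4 * 28 * 28 and
# max_pixels = 16384 * 28 * 28 (both assumed values), the product is in range,
# so neither the floor_by_factor nor the ceil_by_factor branch runs.
```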