mineru-2.2.2-py3-none-any.whl → mineru-2.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
- mineru/backend/vlm/model_output_to_middle_json.py +123 -0
- mineru/backend/vlm/vlm_analyze.py +105 -16
- mineru/backend/vlm/vlm_magic_model.py +201 -135
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
- mineru/cli/client.py +6 -5
- mineru/cli/common.py +17 -16
- mineru/cli/fast_api.py +9 -7
- mineru/cli/gradio_app.py +15 -16
- mineru/cli/vlm_vllm_server.py +4 -0
- mineru/model/table/rec/unet_table/main.py +8 -0
- mineru/model/vlm_vllm_model/__init__.py +0 -0
- mineru/model/vlm_vllm_model/server.py +59 -0
- mineru/resources/header.html +10 -2
- mineru/utils/draw_bbox.py +32 -10
- mineru/utils/enum_class.py +16 -2
- mineru/utils/guess_suffix_or_lang.py +20 -0
- mineru/utils/span_block_fix.py +4 -2
- mineru/version.py +1 -1
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/METADATA +70 -25
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/RECORD +25 -38
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/entry_points.txt +1 -1
- mineru/backend/vlm/base_predictor.py +0 -186
- mineru/backend/vlm/hf_predictor.py +0 -217
- mineru/backend/vlm/predictor.py +0 -111
- mineru/backend/vlm/sglang_client_predictor.py +0 -443
- mineru/backend/vlm/sglang_engine_predictor.py +0 -246
- mineru/backend/vlm/token_to_middle_json.py +0 -122
- mineru/backend/vlm/utils.py +0 -40
- mineru/cli/vlm_sglang_server.py +0 -4
- mineru/model/vlm_hf_model/__init__.py +0 -9
- mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
- mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
- mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
- mineru/model/vlm_sglang_model/__init__.py +0 -14
- mineru/model/vlm_sglang_model/engine.py +0 -264
- mineru/model/vlm_sglang_model/image_processor.py +0 -213
- mineru/model/vlm_sglang_model/logit_processor.py +0 -90
- mineru/model/vlm_sglang_model/model.py +0 -453
- mineru/model/vlm_sglang_model/server.py +0 -75
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/WHEEL +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/top_level.txt +0 -0
mineru/backend/vlm/token_to_middle_json.py
DELETED
@@ -1,122 +0,0 @@
-import os
-import time
-from loguru import logger
-import numpy as np
-import cv2
-from mineru.utils.config_reader import get_llm_aided_config, get_table_enable
-from mineru.utils.cut_image import cut_image_and_table
-from mineru.utils.enum_class import ContentType
-from mineru.utils.hash_utils import str_md5
-from mineru.backend.vlm.vlm_magic_model import MagicModel
-from mineru.utils.pdf_image_tools import get_crop_img
-from mineru.utils.pdf_reader import base64_to_pil_image
-from mineru.utils.table_merge import merge_table
-from mineru.version import __version__
-
-heading_level_import_success = False
-llm_aided_config = get_llm_aided_config()
-if llm_aided_config:
-    title_aided_config = llm_aided_config.get('title_aided', {})
-    if title_aided_config.get('enable', False):
-        try:
-            from mineru.utils.llm_aided import llm_aided_title
-            from mineru.backend.pipeline.model_init import AtomModelSingleton
-            heading_level_import_success = True
-        except Exception as e:
-            logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, "
-                           "please execute `pip install mineru[core]` to install the required packages.")
-
-
-def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dict:
-    """Convert tokens into page info."""
-    # Parse the tokens and extract coordinates and block types
-    # Assumed token format: <|box_start|>x0 y0 x1 y1<|box_end|><|ref_start|>type<|ref_end|><|md_start|>content<|md_end|>
-    # Parsing here must follow the actual token format
-    # Extract every complete block; each block runs from <|box_start|> to <|md_end|> or <|im_end|>
-
-    scale = image_dict["scale"]
-    # page_pil_img = image_dict["img_pil"]
-    page_pil_img = base64_to_pil_image(image_dict["img_base64"])
-    page_img_md5 = str_md5(image_dict["img_base64"])
-    width, height = map(int, page.get_size())
-
-    magic_model = MagicModel(token, width, height)
-    image_blocks = magic_model.get_image_blocks()
-    table_blocks = magic_model.get_table_blocks()
-    title_blocks = magic_model.get_title_blocks()
-
-    # If heading-level refinement is enabled, run text detection on crops of the title_blocks
-    if heading_level_import_success:
-        atom_model_manager = AtomModelSingleton()
-        ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name='ocr',
-            ocr_show_log=False,
-            det_db_box_thresh=0.3,
-            lang='ch_lite'
-        )
-        for title_block in title_blocks:
-            title_pil_img = get_crop_img(title_block['bbox'], page_pil_img, scale)
-            title_np_img = np.array(title_pil_img)
-            # Pad the title image with a 50-pixel white border on every side
-            title_np_img = cv2.copyMakeBorder(
-                title_np_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
-            )
-            title_img = cv2.cvtColor(title_np_img, cv2.COLOR_RGB2BGR)
-            ocr_det_res = ocr_model.ocr(title_img, rec=False)[0]
-            if len(ocr_det_res) > 0:
-                # Compute the average height of all detected boxes
-                avg_height = np.mean([box[2][1] - box[0][1] for box in ocr_det_res])
-                title_block['line_avg_height'] = round(avg_height/scale)
-
-    text_blocks = magic_model.get_text_blocks()
-    interline_equation_blocks = magic_model.get_interline_equation_blocks()
-
-    all_spans = magic_model.get_all_spans()
-    # Crop images for the image/table/interline_equation spans
-    for span in all_spans:
-        if span["type"] in [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]:
-            span = cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)
-
-    page_blocks = []
-    page_blocks.extend([*image_blocks, *table_blocks, *title_blocks, *text_blocks, *interline_equation_blocks])
-    # Sort page_blocks by their index value
-    page_blocks.sort(key=lambda x: x["index"])
-
-    page_info = {"para_blocks": page_blocks, "discarded_blocks": [], "page_size": [width, height], "page_idx": page_index}
-    return page_info
-
-
-def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
-    middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
-    for index, token in enumerate(token_list):
-        page = pdf_doc[index]
-        image_dict = images_list[index]
-        page_info = token_to_page_info(token, image_dict, page, image_writer, index)
-        middle_json["pdf_info"].append(page_info)
-
-    """Merge tables that span pages"""
-    table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
-    if table_enable:
-        merge_table(middle_json["pdf_info"])
-
-    """Refine heading levels with an LLM"""
-    if heading_level_import_success:
-        llm_aided_title_start_time = time.time()
-        llm_aided_title(middle_json["pdf_info"], title_aided_config)
-        logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
-
-    # Close the pdf document
-    pdf_doc.close()
-    return middle_json
-
-
-if __name__ == "__main__":
-
-    output = r"<|box_start|>088 119 472 571<|box_end|><|ref_start|>image<|ref_end|><|md_start|><|md_end|>\n<|box_start|>079 582 482 608<|box_end|><|ref_start|>image_caption<|ref_end|><|md_start|>Fig. 2. (a) Schematic of the change in the FDC over time, and (b) definition of model parameters.<|md_end|>\n<|box_start|>079 624 285 638<|box_end|><|ref_start|>title<|ref_end|><|md_start|># 2.2. Zero flow day analysis<|md_end|>\n<|box_start|>079 656 482 801<|box_end|><|ref_start|>text<|ref_end|><|md_start|>A notable feature of Fig. 1 is the increase in the number of zero flow days. A similar approach to Eq. (2), using an inverse sigmoidal function was employed to assess the impact of afforestation on the number of zero flow days per year \((N_{\mathrm{zero}})\). In this case, the left hand side of Eq. (2) is replaced by \(N_{\mathrm{zero}}\) and \(b\) and \(S\) are constrained to negative as \(N_{\mathrm{zero}}\) decreases as rainfall increases, and increases with plantation growth:<|md_end|>\n<|box_start|>076 813 368 853<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nN_{\mathrm{zero}}=a+b(\Delta P)+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}}{S}\right)}\n\]<|md_end|>\n<|box_start|>079 865 482 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For the average pre-treatment condition \(\Delta P=0\) and \(T=0\), \(N_{\mathrm{zero}}\) approximately equals \(a\). \(Y\) gives<|md_end|>\n<|box_start|>525 119 926 215<|box_end|><|ref_start|>text<|ref_end|><|md_start|>the magnitude of change in zero flow days due to afforestation, and \(S\) describes the shape of the response. For the average climate condition \(\Delta P=0\), \(a+Y\) becomes the number of zero flow days when the new equilibrium condition under afforestation is reached.<|md_end|>\n<|box_start|>525 240 704 253<|box_end|><|ref_start|>title<|ref_end|><|md_start|># 2.3. Statistical analyses<|md_end|>\n<|box_start|>525 271 926 368<|box_end|><|ref_start|>text<|ref_end|><|md_start|>The coefficient of efficiency \((E)\) (Nash and Sutcliffe, 1970; Chiew and McMahon, 1993; Legates and McCabe, 1999) was used as the 'goodness of fit' measure to evaluate the fit between observed and predicted flow deciles (2) and zero flow days (3). \(E\) is given by:<|md_end|>\n<|box_start|>520 375 735 415<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nE=1.0-\frac{\sum_{i=1}^{N}(O_{i}-P_{i})^{2}}{\sum_{i=1}^{N}(O_{i}-\bar{O})^{2}}\n\]<|md_end|>\n<|box_start|>525 424 926 601<|box_end|><|ref_start|>text<|ref_end|><|md_start|>where \(O\) are observed data, \(P\) are predicted values, and \(\bar{O}\) is the mean for the entire period. \(E\) is unity minus the ratio of the mean square error to the variance in the observed data, and ranges from \(-\infty\) to 1.0. Higher values indicate greater agreement between observed and predicted data as per the coefficient of determination \((r^{2})\). \(E\) is used in preference to \(r^{2}\) in evaluating hydrologic modelling because it is a measure of the deviation from the 1:1 line. As \(E\) is always \(<r^{2}\) we have arbitrarily considered \(E>0.7\) to indicate adequate model fits.<|md_end|>\n<|box_start|>525 603 926 731<|box_end|><|ref_start|>text<|ref_end|><|md_start|>It is important to assess the significance of the model parameters to check the model assumptions that rainfall and forest age are driving changes in the FDC. The model (2) was split into simplified forms, where only the rainfall or time terms were included by setting \(b=0\), as shown in Eq. (5), or \(Y=0\) as shown in Eq. (6). The component models (5) and (6) were then tested against the complete model, (2).<|md_end|>\n<|box_start|>520 739 735 778<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}^{\prime}}{S}\right)}\n\]<|md_end|>\n<|box_start|>525 787 553 799<|box_end|><|ref_start|>text<|ref_end|><|md_start|>and<|md_end|>\n<|box_start|>520 807 646 825<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+b\Delta P\n\]<|md_end|>\n<|box_start|>525 833 926 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For both the flow duration curve analysis and zero flow days analysis, a \(t\)-test was then performed to test whether (5) and (6) were significantly different to (2). A critical value of \(t\) exceeding the calculated \(t\)-value<|md_end|><|im_end|>"
-
-    p_info = token_to_page_info(output)
-    # Convert the blocks to JSON text
-    import json
-
-    json_str = json.dumps(p_info, ensure_ascii=False, indent=4)
-    print(json_str)
mineru/backend/vlm/utils.py
DELETED
@@ -1,40 +0,0 @@
-import os
-import re
-from base64 import b64decode
-
-import httpx
-
-_timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
-_file_exts = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".pdf")
-_data_uri_regex = re.compile(r"^data:[^;,]+;base64,")
-
-
-def load_resource(uri: str) -> bytes:
-    if uri.startswith("http://") or uri.startswith("https://"):
-        response = httpx.get(uri, timeout=_timeout)
-        return response.content
-    if uri.startswith("file://"):
-        with open(uri[len("file://") :], "rb") as file:
-            return file.read()
-    if uri.lower().endswith(_file_exts):
-        with open(uri, "rb") as file:
-            return file.read()
-    if re.match(_data_uri_regex, uri):
-        return b64decode(uri.split(",")[1])
-    return b64decode(uri)
-
-
-async def aio_load_resource(uri: str) -> bytes:
-    if uri.startswith("http://") or uri.startswith("https://"):
-        async with httpx.AsyncClient(timeout=_timeout) as client:
-            response = await client.get(uri)
-            return response.content
-    if uri.startswith("file://"):
-        with open(uri[len("file://") :], "rb") as file:
-            return file.read()
-    if uri.lower().endswith(_file_exts):
-        with open(uri, "rb") as file:
-            return file.read()
-    if re.match(_data_uri_regex, uri):
-        return b64decode(uri.split(",")[1])
-    return b64decode(uri)
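
For reference, a usage sketch of the dispatch order in the deleted load_resource (the inputs below are made-up examples): remote URLs are fetched via httpx, file:// URIs and paths with known extensions are read from disk, data URIs are base64-decoded after the comma, and anything else is treated as raw base64.

load_resource("https://example.com/page.png")    # HTTP fetch via httpx
load_resource("file:///tmp/doc.pdf")             # scheme stripped, read from disk
load_resource("scan.jpg")                        # extension matches _file_exts
load_resource("data:image/png;base64,iVBORw0=")  # decode the part after the comma
load_resource("aGVsbG8=")                        # fallback: decode as raw base64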
mineru/cli/vlm_sglang_server.py
DELETED
mineru/model/vlm_hf_model/__init__.py
DELETED
@@ -1,9 +0,0 @@
-from transformers import AutoConfig, AutoImageProcessor, AutoModelForCausalLM
-
-from .configuration_mineru2 import Mineru2QwenConfig
-from .image_processing_mineru2 import Mineru2ImageProcessor
-from .modeling_mineru2 import Mineru2QwenForCausalLM
-
-AutoConfig.register(Mineru2QwenConfig.model_type, Mineru2QwenConfig)
-AutoModelForCausalLM.register(Mineru2QwenConfig, Mineru2QwenForCausalLM)
-AutoImageProcessor.register(Mineru2QwenConfig, slow_image_processor_class=Mineru2ImageProcessor)
mineru/model/vlm_hf_model/configuration_mineru2.py
DELETED
@@ -1,38 +0,0 @@
-from transformers import Qwen2Config
-
-
-class Mineru2QwenConfig(Qwen2Config):
-    model_type = "mineru2_qwen"
-
-    def __init__(
-        self,
-        ignore_index=-100,
-        image_aspect_ratio="square_anyres_max_9",
-        image_grid_pinpoints="(1x1),...,(4x4)",
-        image_token_index=151646,
-        mm_hidden_size=1152,
-        mm_patch_merge_type="spatial_unpad",
-        mm_projector_type="mlp2x_gelu",
-        mm_vision_select_feature="full",
-        mm_vision_select_layer=-2,
-        mm_vision_tower="google/siglip-so400m-patch14-384",
-        tie_word_embeddings=False,
-        tokenizer_model_max_length=16384,
-        tokenizer_padding_side="right",
-        unfreeze_mm_vision_tower=True,
-        **kwargs,
-    ):
-        self.ignore_index = ignore_index
-        self.image_aspect_ratio = image_aspect_ratio
-        self.image_grid_pinpoints = image_grid_pinpoints
-        self.image_token_index = image_token_index
-        self.mm_hidden_size = mm_hidden_size
-        self.mm_patch_merge_type = mm_patch_merge_type
-        self.mm_projector_type = mm_projector_type
-        self.mm_vision_select_feature = mm_vision_select_feature
-        self.mm_vision_select_layer = mm_vision_select_layer
-        self.mm_vision_tower = mm_vision_tower
-        self.tokenizer_model_max_length = tokenizer_model_max_length
-        self.tokenizer_padding_side = tokenizer_padding_side
-        self.unfreeze_mm_vision_tower = unfreeze_mm_vision_tower
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
mineru/model/vlm_hf_model/image_processing_mineru2.py
DELETED
@@ -1,269 +0,0 @@
-import ast
-import math
-import re
-from functools import partial, reduce
-from typing import Dict, Optional, Union
-
-import numpy as np
-import torch
-from PIL import Image
-from transformers.image_processing_utils import (
-    BaseImageProcessor,
-    BatchFeature,
-    get_size_dict,
-)
-from transformers.image_transforms import (
-    convert_to_rgb,
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
-)
-from transformers.image_utils import (
-    ChannelDimension,
-    PILImageResampling,
-    to_numpy_array,
-)
-from transformers.utils import TensorType
-
-
-def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
-    original_width, original_height = original_size
-    best_fit = (0, 0)
-    max_effective_resolution = 0
-    min_wasted_resolution = float("inf")
-
-    for width, height in possible_resolutions:
-        scale = min(width / original_width, height / original_height)
-        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
-        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
-        wasted_resolution = (width * height) - effective_resolution
-
-        if effective_resolution > max_effective_resolution or (
-            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
-        ):
-            max_effective_resolution = effective_resolution
-            min_wasted_resolution = wasted_resolution
-            best_fit = (width, height)
-
-    return best_fit
-
-
-def divide_to_patches(image, patch_size):
-    patches = []
-    width, height = image.size
-    for i in range(0, height, patch_size):
-        for j in range(0, width, patch_size):
-            box = (j, i, j + patch_size, i + patch_size)
-            patch = image.crop(box)
-            patches.append(patch)
-    return patches
-
-
-def expand2square(pil_img, background_color):
-    width, height = pil_img.size
-    if width == height:
-        return pil_img
-    if pil_img.mode == "L":
-        pil_img = pil_img.convert("RGB")
-    if width > height:
-        result = Image.new(pil_img.mode, (width, width), background_color)
-        result.paste(pil_img, (0, (width - height) // 2))
-        return result
-    else:
-        result = Image.new(pil_img.mode, (height, height), background_color)
-        result.paste(pil_img, ((height - width) // 2, 0))
-        return result
-
-
-def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
-    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-        range_start = tuple(map(int, matches[0]))
-        range_end = tuple(map(int, matches[-1]))
-        grid_pinpoints = [
-            (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-        ]
-        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-    if type(grid_pinpoints) is list:
-        possible_resolutions = grid_pinpoints
-    else:
-        possible_resolutions = ast.literal_eval(grid_pinpoints)  # type: ignore
-    width, height = select_best_resolution(image_size, possible_resolutions)
-    return width // patch_size, height // patch_size
-
-
-# This function is not used.
-def resize_and_pad_image(image, target_resolution):
-    original_width, original_height = image.size
-    target_width, target_height = target_resolution
-
-    scale_w = target_width / original_width
-    scale_h = target_height / original_height
-
-    if scale_w < scale_h:
-        new_width = target_width
-        new_height = min(math.ceil(original_height * scale_w), target_height)
-    else:
-        new_height = target_height
-        new_width = min(math.ceil(original_width * scale_h), target_width)
-
-    # Resize the image
-    resized_image = image.resize((new_width, new_height))
-
-    new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
-    paste_x = (target_width - new_width) // 2
-    paste_y = (target_height - new_height) // 2
-    new_image.paste(resized_image, (paste_x, paste_y))
-
-    return new_image
-
-
-# DIFFERENT from sglang.srt.mm_utils.process_anyres_image
-def process_anyres_image(image, processor, grid_pinpoints):
-    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-        patch_size = processor.crop_size["height"]
-        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-        range_start = tuple(map(int, matches[0]))
-        range_end = tuple(map(int, matches[-1]))
-        grid_pinpoints = [
-            (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-        ]
-        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-
-    if type(grid_pinpoints) is list:
-        possible_resolutions = grid_pinpoints
-    else:
-        possible_resolutions = ast.literal_eval(grid_pinpoints)  # type: ignore
-    best_resolution = select_best_resolution(image.size, possible_resolutions)
-
-    # image_padded = resize_and_pad_image(image, best_resolution)
-    image_padded = image.resize(best_resolution)
-
-    patches = divide_to_patches(image_padded, processor.crop_size["height"])
-
-    image_original_resize = image.resize((processor.crop_size["height"], processor.crop_size["height"]))
-
-    image_patches = [image_original_resize] + patches
-    image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
-    return torch.stack(image_patches, dim=0)
-
-
-def process_images(images, image_processor, model_cfg):
-    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", "")
-    new_images = []
-    if image_aspect_ratio == "pad":
-        for image in images:
-            image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
-            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
-            new_images.append(image)
-    elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-        for image in images:
-            image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
-            new_images.append(image)
-    else:
-        return image_processor(images, return_tensors="pt")["pixel_values"]
-    if all(x.shape == new_images[0].shape for x in new_images):
-        new_images = torch.stack(new_images, dim=0)
-    return new_images
-
-
-class Mineru2ImageProcessor(BaseImageProcessor):
-    model_input_names = ["pixel_values"]
-
-    def __init__(
-        self,
-        image_mean=(0.5, 0.5, 0.5),
-        image_std=(0.5, 0.5, 0.5),
-        size=(384, 384),
-        crop_size: Optional[Dict[str, int]] = None,
-        resample=PILImageResampling.BICUBIC,
-        rescale_factor=1 / 255,
-        data_format=ChannelDimension.FIRST,
-        image_aspect_ratio: Optional[str] = None,
-        image_grid_pinpoints: Optional[list] = None,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-
-        crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
-        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
-
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.size = size
-        self.resample = resample
-        self.rescale_factor = rescale_factor
-        self.data_format = data_format
-        self.crop_size = crop_size
-        self.image_aspect_ratio = image_aspect_ratio
-        self.image_grid_pinpoints = image_grid_pinpoints
-        self.in_e2e_processing = False
-
-    def _preprocess(self, images):
-        if isinstance(images, Image.Image):
-            images = [images]
-        else:
-            # to adapt video data
-            images = [to_numpy_array(image) for image in images]
-        assert isinstance(images, list)
-
-        transforms = [
-            convert_to_rgb,
-            to_numpy_array,
-            partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
-            partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
-            partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
-            partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
-        ]
-
-        images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-        return {"pixel_values": images}
-
-    def _preprocess_end_to_end(self, images):
-        image_aspect_ratio = self.image_aspect_ratio
-        image_grid_pinpoints = self.image_grid_pinpoints
-        assert image_aspect_ratio is not None
-        assert image_grid_pinpoints is not None
-
-        pixel_values = []
-        if image_aspect_ratio == "pad":
-            for image in images:
-                image = expand2square(image, tuple(int(x * 255) for x in self.image_mean))
-                image = self._preprocess(image)["pixel_values"][0]
-                pixel_values.append(image)
-        elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-            for image in images:
-                image = process_anyres_image(image, self, self.image_grid_pinpoints)
-                pixel_values.append(image.numpy())
-        else:
-            pixel_values = self._preprocess(images)["pixel_values"]
-
-        if isinstance(pixel_values, list) and all(x.shape == pixel_values[0].shape for x in pixel_values):
-            pixel_values = np.stack(pixel_values, axis=0)
-
-        # CAUTION: here used (height, width).
-        image_sizes = [(image.height, image.width) for image in images]
-        assert len(pixel_values) == len(image_sizes)
-
-        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
-
-    def preprocess(
-        self,
-        images,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        **kwargs,
-    ):
-        if self.image_aspect_ratio is None or self.in_e2e_processing:
-            data = self._preprocess(images)
-        else:
-            assert self.image_grid_pinpoints is not None
-            self.in_e2e_processing = True
-            try:
-                data = self._preprocess_end_to_end(images)
-            finally:
-                self.in_e2e_processing = False
-
-        return BatchFeature(data=data, tensor_type=return_tensors)
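
A worked example of the anyres path above (illustrative; it assumes the deleted helpers are importable). For a 1000x800 page image with 384-pixel patches, (1152, 1152) wins among the candidates below because it is the smallest resolution that keeps the full 1000*800 effective area after scaling.

best = select_best_resolution((1000, 800), [(1152, 768), (1152, 1152), (1536, 1152)])
assert best == (1152, 1152)
# process_anyres_image then resizes the page to 1152x1152, cuts it into
# (1152 // 384) ** 2 = 9 patches, prepends the 384x384 global resize of the
# original image, and stacks 10 tensors.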