mineru-2.2.2-py3-none-any.whl → mineru-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
  2. mineru/backend/vlm/model_output_to_middle_json.py +123 -0
  3. mineru/backend/vlm/vlm_analyze.py +97 -16
  4. mineru/backend/vlm/vlm_magic_model.py +201 -135
  5. mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
  6. mineru/cli/client.py +6 -5
  7. mineru/cli/common.py +17 -16
  8. mineru/cli/fast_api.py +9 -7
  9. mineru/cli/gradio_app.py +15 -16
  10. mineru/cli/vlm_vllm_server.py +4 -0
  11. mineru/model/table/rec/unet_table/main.py +8 -0
  12. mineru/model/vlm_vllm_model/__init__.py +0 -0
  13. mineru/model/vlm_vllm_model/server.py +51 -0
  14. mineru/resources/header.html +10 -2
  15. mineru/utils/draw_bbox.py +32 -10
  16. mineru/utils/enum_class.py +16 -2
  17. mineru/utils/guess_suffix_or_lang.py +20 -0
  18. mineru/utils/span_block_fix.py +4 -2
  19. mineru/version.py +1 -1
  20. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/METADATA +70 -25
  21. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/RECORD +25 -38
  22. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/entry_points.txt +1 -1
  23. mineru/backend/vlm/base_predictor.py +0 -186
  24. mineru/backend/vlm/hf_predictor.py +0 -217
  25. mineru/backend/vlm/predictor.py +0 -111
  26. mineru/backend/vlm/sglang_client_predictor.py +0 -443
  27. mineru/backend/vlm/sglang_engine_predictor.py +0 -246
  28. mineru/backend/vlm/token_to_middle_json.py +0 -122
  29. mineru/backend/vlm/utils.py +0 -40
  30. mineru/cli/vlm_sglang_server.py +0 -4
  31. mineru/model/vlm_hf_model/__init__.py +0 -9
  32. mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
  33. mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
  34. mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
  35. mineru/model/vlm_sglang_model/__init__.py +0 -14
  36. mineru/model/vlm_sglang_model/engine.py +0 -264
  37. mineru/model/vlm_sglang_model/image_processor.py +0 -213
  38. mineru/model/vlm_sglang_model/logit_processor.py +0 -90
  39. mineru/model/vlm_sglang_model/model.py +0 -453
  40. mineru/model/vlm_sglang_model/server.py +0 -75
  41. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/WHEEL +0 -0
  42. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/licenses/LICENSE.md +0 -0
  43. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/top_level.txt +0 -0
mineru/backend/vlm/token_to_middle_json.py
@@ -1,122 +0,0 @@
- import os
- import time
- from loguru import logger
- import numpy as np
- import cv2
- from mineru.utils.config_reader import get_llm_aided_config, get_table_enable
- from mineru.utils.cut_image import cut_image_and_table
- from mineru.utils.enum_class import ContentType
- from mineru.utils.hash_utils import str_md5
- from mineru.backend.vlm.vlm_magic_model import MagicModel
- from mineru.utils.pdf_image_tools import get_crop_img
- from mineru.utils.pdf_reader import base64_to_pil_image
- from mineru.utils.table_merge import merge_table
- from mineru.version import __version__
-
- heading_level_import_success = False
- llm_aided_config = get_llm_aided_config()
- if llm_aided_config:
-     title_aided_config = llm_aided_config.get('title_aided', {})
-     if title_aided_config.get('enable', False):
-         try:
-             from mineru.utils.llm_aided import llm_aided_title
-             from mineru.backend.pipeline.model_init import AtomModelSingleton
-             heading_level_import_success = True
-         except Exception as e:
-             logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, "
-                            "please execute `pip install mineru[core]` to install the required packages.")
-
-
- def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dict:
-     """Convert model output tokens into page info."""
-     # Parse the tokens to extract coordinates and block types.
-     # The token format is assumed to be: <|box_start|>x0 y0 x1 y1<|box_end|><|ref_start|>type<|ref_end|><|md_start|>content<|md_end|>
-     # Parsing must follow the actual token format.
-     # Extract all complete blocks; each block starts at <|box_start|> and ends at <|md_end|> or <|im_end|>.
-
-     scale = image_dict["scale"]
-     # page_pil_img = image_dict["img_pil"]
-     page_pil_img = base64_to_pil_image(image_dict["img_base64"])
-     page_img_md5 = str_md5(image_dict["img_base64"])
-     width, height = map(int, page.get_size())
-
-     magic_model = MagicModel(token, width, height)
-     image_blocks = magic_model.get_image_blocks()
-     table_blocks = magic_model.get_table_blocks()
-     title_blocks = magic_model.get_title_blocks()
-
-     # If heading-level optimization is enabled, run text detection on crops of the title blocks.
-     if heading_level_import_success:
-         atom_model_manager = AtomModelSingleton()
-         ocr_model = atom_model_manager.get_atom_model(
-             atom_model_name='ocr',
-             ocr_show_log=False,
-             det_db_box_thresh=0.3,
-             lang='ch_lite'
-         )
-         for title_block in title_blocks:
-             title_pil_img = get_crop_img(title_block['bbox'], page_pil_img, scale)
-             title_np_img = np.array(title_pil_img)
-             # Pad title_np_img with a 50-pixel white border on all four sides.
-             title_np_img = cv2.copyMakeBorder(
-                 title_np_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
-             )
-             title_img = cv2.cvtColor(title_np_img, cv2.COLOR_RGB2BGR)
-             ocr_det_res = ocr_model.ocr(title_img, rec=False)[0]
-             if len(ocr_det_res) > 0:
-                 # Compute the average height over all detected boxes.
-                 avg_height = np.mean([box[2][1] - box[0][1] for box in ocr_det_res])
-                 title_block['line_avg_height'] = round(avg_height/scale)
-
-     text_blocks = magic_model.get_text_blocks()
-     interline_equation_blocks = magic_model.get_interline_equation_blocks()
-
-     all_spans = magic_model.get_all_spans()
-     # Crop images for image/table/interline_equation spans.
-     for span in all_spans:
-         if span["type"] in [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]:
-             span = cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)
-
-     page_blocks = []
-     page_blocks.extend([*image_blocks, *table_blocks, *title_blocks, *text_blocks, *interline_equation_blocks])
-     # Sort page_blocks by their index value.
-     page_blocks.sort(key=lambda x: x["index"])
-
-     page_info = {"para_blocks": page_blocks, "discarded_blocks": [], "page_size": [width, height], "page_idx": page_index}
-     return page_info
-
-
- def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
-     middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
-     for index, token in enumerate(token_list):
-         page = pdf_doc[index]
-         image_dict = images_list[index]
-         page_info = token_to_page_info(token, image_dict, page, image_writer, index)
-         middle_json["pdf_info"].append(page_info)
-
-     """Merge tables across pages."""
-     table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
-     if table_enable:
-         merge_table(middle_json["pdf_info"])
-
-     """LLM-aided heading-level refinement."""
-     if heading_level_import_success:
-         llm_aided_title_start_time = time.time()
-         llm_aided_title(middle_json["pdf_info"], title_aided_config)
-         logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
-
-     # Close the PDF document.
-     pdf_doc.close()
-     return middle_json
-
-
- if __name__ == "__main__":
-
-     output = r"<|box_start|>088 119 472 571<|box_end|><|ref_start|>image<|ref_end|><|md_start|>![]('img_url')<|md_end|>\n<|box_start|>079 582 482 608<|box_end|><|ref_start|>image_caption<|ref_end|><|md_start|>Fig. 2. (a) Schematic of the change in the FDC over time, and (b) definition of model parameters.<|md_end|>\n<|box_start|>079 624 285 638<|box_end|><|ref_start|>title<|ref_end|><|md_start|># 2.2. Zero flow day analysis<|md_end|>\n<|box_start|>079 656 482 801<|box_end|><|ref_start|>text<|ref_end|><|md_start|>A notable feature of Fig. 1 is the increase in the number of zero flow days. A similar approach to Eq. (2), using an inverse sigmoidal function was employed to assess the impact of afforestation on the number of zero flow days per year \((N_{\mathrm{zero}})\). In this case, the left hand side of Eq. (2) is replaced by \(N_{\mathrm{zero}}\) and \(b\) and \(S\) are constrained to negative as \(N_{\mathrm{zero}}\) decreases as rainfall increases, and increases with plantation growth:<|md_end|>\n<|box_start|>076 813 368 853<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nN_{\mathrm{zero}}=a+b(\Delta P)+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}}{S}\right)}\n\]<|md_end|>\n<|box_start|>079 865 482 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For the average pre-treatment condition \(\Delta P=0\) and \(T=0\), \(N_{\mathrm{zero}}\) approximately equals \(a\). \(Y\) gives<|md_end|>\n<|box_start|>525 119 926 215<|box_end|><|ref_start|>text<|ref_end|><|md_start|>the magnitude of change in zero flow days due to afforestation, and \(S\) describes the shape of the response. For the average climate condition \(\Delta P=0\), \(a+Y\) becomes the number of zero flow days when the new equilibrium condition under afforestation is reached.<|md_end|>\n<|box_start|>525 240 704 253<|box_end|><|ref_start|>title<|ref_end|><|md_start|># 2.3. Statistical analyses<|md_end|>\n<|box_start|>525 271 926 368<|box_end|><|ref_start|>text<|ref_end|><|md_start|>The coefficient of efficiency \((E)\) (Nash and Sutcliffe, 1970; Chiew and McMahon, 1993; Legates and McCabe, 1999) was used as the 'goodness of fit' measure to evaluate the fit between observed and predicted flow deciles (2) and zero flow days (3). \(E\) is given by:<|md_end|>\n<|box_start|>520 375 735 415<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nE=1.0-\frac{\sum_{i=1}^{N}(O_{i}-P_{i})^{2}}{\sum_{i=1}^{N}(O_{i}-\bar{O})^{2}}\n\]<|md_end|>\n<|box_start|>525 424 926 601<|box_end|><|ref_start|>text<|ref_end|><|md_start|>where \(O\) are observed data, \(P\) are predicted values, and \(\bar{O}\) is the mean for the entire period. \(E\) is unity minus the ratio of the mean square error to the variance in the observed data, and ranges from \(-\infty\) to 1.0. Higher values indicate greater agreement between observed and predicted data as per the coefficient of determination \((r^{2})\). \(E\) is used in preference to \(r^{2}\) in evaluating hydrologic modelling because it is a measure of the deviation from the 1:1 line. As \(E\) is always \(<r^{2}\) we have arbitrarily considered \(E>0.7\) to indicate adequate model fits.<|md_end|>\n<|box_start|>525 603 926 731<|box_end|><|ref_start|>text<|ref_end|><|md_start|>It is important to assess the significance of the model parameters to check the model assumptions that rainfall and forest age are driving changes in the FDC. The model (2) was split into simplified forms, where only the rainfall or time terms were included by setting \(b=0\), as shown in Eq. (5), or \(Y=0\) as shown in Eq. (6). The component models (5) and (6) were then tested against the complete model, (2).<|md_end|>\n<|box_start|>520 739 735 778<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}^{\prime}}{S}\right)}\n\]<|md_end|>\n<|box_start|>525 787 553 799<|box_end|><|ref_start|>text<|ref_end|><|md_start|>and<|md_end|>\n<|box_start|>520 807 646 825<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+b\Delta P\n\]<|md_end|>\n<|box_start|>525 833 926 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For both the flow duration curve analysis and zero flow days analysis, a \(t\)-test was then performed to test whether (5) and (6) were significantly different to (2). A critical value of \(t\) exceeding the calculated \(t\)-value<|md_end|><|im_end|>"
-
-     p_info = token_to_page_info(output)
-     # Convert the blocks to JSON text.
-     import json
-
-     json_str = json.dumps(p_info, ensure_ascii=False, indent=4)
-     print(json_str)
mineru/backend/vlm/utils.py
@@ -1,40 +0,0 @@
- import os
- import re
- from base64 import b64decode
-
- import httpx
-
- _timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
- _file_exts = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".pdf")
- _data_uri_regex = re.compile(r"^data:[^;,]+;base64,")
-
-
- def load_resource(uri: str) -> bytes:
-     if uri.startswith("http://") or uri.startswith("https://"):
-         response = httpx.get(uri, timeout=_timeout)
-         return response.content
-     if uri.startswith("file://"):
-         with open(uri[len("file://") :], "rb") as file:
-             return file.read()
-     if uri.lower().endswith(_file_exts):
-         with open(uri, "rb") as file:
-             return file.read()
-     if re.match(_data_uri_regex, uri):
-         return b64decode(uri.split(",")[1])
-     return b64decode(uri)
-
-
- async def aio_load_resource(uri: str) -> bytes:
-     if uri.startswith("http://") or uri.startswith("https://"):
-         async with httpx.AsyncClient(timeout=_timeout) as client:
-             response = await client.get(uri)
-             return response.content
-     if uri.startswith("file://"):
-         with open(uri[len("file://") :], "rb") as file:
-             return file.read()
-     if uri.lower().endswith(_file_exts):
-         with open(uri, "rb") as file:
-             return file.read()
-     if re.match(_data_uri_regex, uri):
-         return b64decode(uri.split(",")[1])
-     return b64decode(uri)
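
A usage sketch of the removed load_resource helper, showing its resolution order: http(s) URL, file:// URI, bare path with a known extension, data URI, then raw base64. The URIs below are made-up examples:

    from base64 import b64encode
    from mineru.backend.vlm.utils import load_resource  # module path as of 2.2.2; removed in 2.5.0

    data = b64encode(b"hello").decode()

    load_resource("https://example.com/page.png")    # fetched via httpx, REQUEST_TIMEOUT applies
    load_resource("file:///tmp/input.pdf")           # file:// URI, read from disk
    load_resource("/tmp/scan.jpg")                   # bare path, matched by extension
    load_resource("data:image/png;base64," + data)   # data URI, base64 payload decoded
    load_resource(data)                              # anything else is treated as raw base64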
mineru/cli/vlm_sglang_server.py
@@ -1,4 +0,0 @@
- from ..model.vlm_sglang_model.server import main
-
- if __name__ == "__main__":
-     main()
mineru/model/vlm_hf_model/__init__.py
@@ -1,9 +0,0 @@
- from transformers import AutoConfig, AutoImageProcessor, AutoModelForCausalLM
-
- from .configuration_mineru2 import Mineru2QwenConfig
- from .image_processing_mineru2 import Mineru2ImageProcessor
- from .modeling_mineru2 import Mineru2QwenForCausalLM
-
- AutoConfig.register(Mineru2QwenConfig.model_type, Mineru2QwenConfig)
- AutoModelForCausalLM.register(Mineru2QwenConfig, Mineru2QwenForCausalLM)
- AutoImageProcessor.register(Mineru2QwenConfig, slow_image_processor_class=Mineru2ImageProcessor)
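
These registrations follow the standard transformers custom-architecture pattern: once a config class is registered under its model_type, the Auto* factories can dispatch on a checkpoint's config.json. A minimal sketch of the same pattern with hypothetical classes (not MinerU code):

    from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel

    class MyConfig(PretrainedConfig):
        model_type = "my_model"  # the key AutoConfig dispatches on

    class MyModel(PreTrainedModel):
        config_class = MyConfig

    AutoConfig.register("my_model", MyConfig)
    AutoModelForCausalLM.register(MyConfig, MyModel)
    # AutoModelForCausalLM.from_pretrained(...) can now resolve checkpoints whose
    # config.json declares "model_type": "my_model".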
mineru/model/vlm_hf_model/configuration_mineru2.py
@@ -1,38 +0,0 @@
- from transformers import Qwen2Config
-
-
- class Mineru2QwenConfig(Qwen2Config):
-     model_type = "mineru2_qwen"
-
-     def __init__(
-         self,
-         ignore_index=-100,
-         image_aspect_ratio="square_anyres_max_9",
-         image_grid_pinpoints="(1x1),...,(4x4)",
-         image_token_index=151646,
-         mm_hidden_size=1152,
-         mm_patch_merge_type="spatial_unpad",
-         mm_projector_type="mlp2x_gelu",
-         mm_vision_select_feature="full",
-         mm_vision_select_layer=-2,
-         mm_vision_tower="google/siglip-so400m-patch14-384",
-         tie_word_embeddings=False,
-         tokenizer_model_max_length=16384,
-         tokenizer_padding_side="right",
-         unfreeze_mm_vision_tower=True,
-         **kwargs,
-     ):
-         self.ignore_index = ignore_index
-         self.image_aspect_ratio = image_aspect_ratio
-         self.image_grid_pinpoints = image_grid_pinpoints
-         self.image_token_index = image_token_index
-         self.mm_hidden_size = mm_hidden_size
-         self.mm_patch_merge_type = mm_patch_merge_type
-         self.mm_projector_type = mm_projector_type
-         self.mm_vision_select_feature = mm_vision_select_feature
-         self.mm_vision_select_layer = mm_vision_select_layer
-         self.mm_vision_tower = mm_vision_tower
-         self.tokenizer_model_max_length = tokenizer_model_max_length
-         self.tokenizer_padding_side = tokenizer_padding_side
-         self.unfreeze_mm_vision_tower = unfreeze_mm_vision_tower
-         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
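
The multimodal fields above (image_aspect_ratio, image_grid_pinpoints, the mm_* settings) are what the image-processing module below consumes. A hypothetical instantiation of the class shown above, overriding one default via kwargs:

    cfg = Mineru2QwenConfig(tokenizer_model_max_length=8192)
    print(cfg.model_type)            # "mineru2_qwen"
    print(cfg.image_aspect_ratio)    # "square_anyres_max_9"
    print(cfg.image_grid_pinpoints)  # "(1x1),...,(4x4)"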
mineru/model/vlm_hf_model/image_processing_mineru2.py
@@ -1,269 +0,0 @@
- import ast
- import math
- import re
- from functools import partial, reduce
- from typing import Dict, Optional, Union
-
- import numpy as np
- import torch
- from PIL import Image
- from transformers.image_processing_utils import (
-     BaseImageProcessor,
-     BatchFeature,
-     get_size_dict,
- )
- from transformers.image_transforms import (
-     convert_to_rgb,
-     normalize,
-     rescale,
-     resize,
-     to_channel_dimension_format,
- )
- from transformers.image_utils import (
-     ChannelDimension,
-     PILImageResampling,
-     to_numpy_array,
- )
- from transformers.utils import TensorType
-
-
- def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
-     original_width, original_height = original_size
-     best_fit = (0, 0)
-     max_effective_resolution = 0
-     min_wasted_resolution = float("inf")
-
-     for width, height in possible_resolutions:
-         scale = min(width / original_width, height / original_height)
-         downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
-         effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
-         wasted_resolution = (width * height) - effective_resolution
-
-         if effective_resolution > max_effective_resolution or (
-             effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
-         ):
-             max_effective_resolution = effective_resolution
-             min_wasted_resolution = wasted_resolution
-             best_fit = (width, height)
-
-     return best_fit
-
-
- def divide_to_patches(image, patch_size):
-     patches = []
-     width, height = image.size
-     for i in range(0, height, patch_size):
-         for j in range(0, width, patch_size):
-             box = (j, i, j + patch_size, i + patch_size)
-             patch = image.crop(box)
-             patches.append(patch)
-     return patches
-
-
- def expand2square(pil_img, background_color):
-     width, height = pil_img.size
-     if width == height:
-         return pil_img
-     if pil_img.mode == "L":
-         pil_img = pil_img.convert("RGB")
-     if width > height:
-         result = Image.new(pil_img.mode, (width, width), background_color)
-         result.paste(pil_img, (0, (width - height) // 2))
-         return result
-     else:
-         result = Image.new(pil_img.mode, (height, height), background_color)
-         result.paste(pil_img, ((height - width) // 2, 0))
-         return result
-
-
- def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
-     if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-         assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-         matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-         range_start = tuple(map(int, matches[0]))
-         range_end = tuple(map(int, matches[-1]))
-         grid_pinpoints = [
-             (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-         ]
-         grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-     if type(grid_pinpoints) is list:
-         possible_resolutions = grid_pinpoints
-     else:
-         possible_resolutions = ast.literal_eval(grid_pinpoints)  # type: ignore
-     width, height = select_best_resolution(image_size, possible_resolutions)
-     return width // patch_size, height // patch_size
-
-
- # This function is not used.
- def resize_and_pad_image(image, target_resolution):
-     original_width, original_height = image.size
-     target_width, target_height = target_resolution
-
-     scale_w = target_width / original_width
-     scale_h = target_height / original_height
-
-     if scale_w < scale_h:
-         new_width = target_width
-         new_height = min(math.ceil(original_height * scale_w), target_height)
-     else:
-         new_height = target_height
-         new_width = min(math.ceil(original_width * scale_h), target_width)
-
-     # Resize the image
-     resized_image = image.resize((new_width, new_height))
-
-     new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
-     paste_x = (target_width - new_width) // 2
-     paste_y = (target_height - new_height) // 2
-     new_image.paste(resized_image, (paste_x, paste_y))
-
-     return new_image
-
-
- # DIFFERENT from sglang.srt.mm_utils.process_anyres_image
- def process_anyres_image(image, processor, grid_pinpoints):
-     if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-         patch_size = processor.crop_size["height"]
-         assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-         matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-         range_start = tuple(map(int, matches[0]))
-         range_end = tuple(map(int, matches[-1]))
-         grid_pinpoints = [
-             (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-         ]
-         grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-
-     if type(grid_pinpoints) is list:
-         possible_resolutions = grid_pinpoints
-     else:
-         possible_resolutions = ast.literal_eval(grid_pinpoints)  # type: ignore
-     best_resolution = select_best_resolution(image.size, possible_resolutions)
-
-     # image_padded = resize_and_pad_image(image, best_resolution)
-     image_padded = image.resize(best_resolution)
-
-     patches = divide_to_patches(image_padded, processor.crop_size["height"])
-
-     image_original_resize = image.resize((processor.crop_size["height"], processor.crop_size["height"]))
-
-     image_patches = [image_original_resize] + patches
-     image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
-     return torch.stack(image_patches, dim=0)
-
-
- def process_images(images, image_processor, model_cfg):
-     image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", "")
-     new_images = []
-     if image_aspect_ratio == "pad":
-         for image in images:
-             image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
-             image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
-             new_images.append(image)
-     elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-         for image in images:
-             image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
-             new_images.append(image)
-     else:
-         return image_processor(images, return_tensors="pt")["pixel_values"]
-     if all(x.shape == new_images[0].shape for x in new_images):
-         new_images = torch.stack(new_images, dim=0)
-     return new_images
-
-
- class Mineru2ImageProcessor(BaseImageProcessor):
-     model_input_names = ["pixel_values"]
-
-     def __init__(
-         self,
-         image_mean=(0.5, 0.5, 0.5),
-         image_std=(0.5, 0.5, 0.5),
-         size=(384, 384),
-         crop_size: Optional[Dict[str, int]] = None,
-         resample=PILImageResampling.BICUBIC,
-         rescale_factor=1 / 255,
-         data_format=ChannelDimension.FIRST,
-         image_aspect_ratio: Optional[str] = None,
-         image_grid_pinpoints: Optional[list] = None,
-         **kwargs,
-     ) -> None:
-         super().__init__(**kwargs)
-
-         crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
-         crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
-
-         self.image_mean = image_mean
-         self.image_std = image_std
-         self.size = size
-         self.resample = resample
-         self.rescale_factor = rescale_factor
-         self.data_format = data_format
-         self.crop_size = crop_size
-         self.image_aspect_ratio = image_aspect_ratio
-         self.image_grid_pinpoints = image_grid_pinpoints
-         self.in_e2e_processing = False
-
-     def _preprocess(self, images):
-         if isinstance(images, Image.Image):
-             images = [images]
-         else:
-             # to adapt video data
-             images = [to_numpy_array(image) for image in images]
-         assert isinstance(images, list)
-
-         transforms = [
-             convert_to_rgb,
-             to_numpy_array,
-             partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
-             partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
-             partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
-             partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
-         ]
-
-         images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-         return {"pixel_values": images}
-
-     def _preprocess_end_to_end(self, images):
-         image_aspect_ratio = self.image_aspect_ratio
-         image_grid_pinpoints = self.image_grid_pinpoints
-         assert image_aspect_ratio is not None
-         assert image_grid_pinpoints is not None
-
-         pixel_values = []
-         if image_aspect_ratio == "pad":
-             for image in images:
-                 image = expand2square(image, tuple(int(x * 255) for x in self.image_mean))
-                 image = self._preprocess(image)["pixel_values"][0]
-                 pixel_values.append(image)
-         elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-             for image in images:
-                 image = process_anyres_image(image, self, self.image_grid_pinpoints)
-                 pixel_values.append(image.numpy())
-         else:
-             pixel_values = self._preprocess(images)["pixel_values"]
-
-         if isinstance(pixel_values, list) and all(x.shape == pixel_values[0].shape for x in pixel_values):
-             pixel_values = np.stack(pixel_values, axis=0)
-
-         # CAUTION: here used (height, width).
-         image_sizes = [(image.height, image.width) for image in images]
-         assert len(pixel_values) == len(image_sizes)
-
-         return {"pixel_values": pixel_values, "image_sizes": image_sizes}
-
-     def preprocess(
-         self,
-         images,
-         return_tensors: Optional[Union[str, TensorType]] = None,
-         **kwargs,
-     ):
-         if self.image_aspect_ratio is None or self.in_e2e_processing:
-             data = self._preprocess(images)
-         else:
-             assert self.image_grid_pinpoints is not None
-             self.in_e2e_processing = True
-             try:
-                 data = self._preprocess_end_to_end(images)
-             finally:
-                 self.in_e2e_processing = False
-
-         return BatchFeature(data=data, tensor_type=return_tensors)
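
To make the anyres path concrete: the pinpoints string "(1x1),...,(4x4)" expands to every grid between the first and last match, scaled by the patch size, and select_best_resolution then picks the candidate that maximizes effective resolution while minimizing wasted canvas. A small worked example reusing select_best_resolution as defined above (the page size is made up):

    patch_size = 384
    grid_pinpoints = [(i, j) for i in range(1, 5) for j in range(1, 5)]  # (1,1) .. (4,4)
    possible_resolutions = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]

    best = select_best_resolution((1000, 1400), possible_resolutions)
    # best == (1152, 1536), a 3x4 grid of 384-pixel patches: the smallest candidate
    # that still covers the full page area (scale >= 1), so it wastes the least
    # canvas among the full-coverage options.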