doc-page-extractor 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of doc-page-extractor might be problematic.
@@ -1,198 +0,0 @@
- import torch
-
- from torch import nn
- from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
-
- from .conversation import get_conv_template
-
- class InternVL(nn.Module):
-     def __init__(
-         self,
-         model_path='U4R/StructTable-InternVL2-1B',
-         max_new_tokens=1024,
-         max_time=30,
-         flash_attn=True,
-         cache_dir=None,
-         local_files_only=None,
-         **kwargs,
-     ):
-         super().__init__()
-         self.model_path = model_path
-         self.max_new_tokens = max_new_tokens
-         self.max_generate_time = max_time
-         self.flash_attn = flash_attn
-         self.cache_dir = cache_dir
-         self.local_files_only = local_files_only
-
-         # init tokenizer, image processor, and model from ckpt path
-         self.init_tokenizer(model_path)
-         self.init_image_processor(model_path)
-         self.init_model(model_path)
-
-         self.prompt_template = {
-             'latex': '<latex>',
-             'html': '<html>',
-             'markdown': '<markdown>',
-         }
-         # supported output formats
-         self.supported_output_format = ['latex', 'html', 'markdown']
-
-     def init_model(self, model_path):
-         self.model = AutoModel.from_pretrained(
-             pretrained_model_name_or_path=model_path,
-             trust_remote_code=True,
-             torch_dtype=torch.bfloat16,
-             low_cpu_mem_usage=True,
-             use_flash_attn=self.flash_attn,
-             cache_dir=self.cache_dir,
-             local_files_only=self.local_files_only,
-         )
-         self.model.eval()
-
-     def init_image_processor(self, image_processor_path):
-         self.image_processor = AutoImageProcessor.from_pretrained(
-             pretrained_model_name_or_path=image_processor_path,
-             trust_remote_code=True,
-             cache_dir=self.cache_dir,
-             local_files_only=self.local_files_only,
-         )
-
-     def init_tokenizer(self, tokenizer_path):
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             pretrained_model_name_or_path=tokenizer_path,
-             trust_remote_code=True,
-             use_fast=False,
-             cache_dir=self.cache_dir,
-             local_files_only=self.local_files_only,
-         )
-
-         self.image_context_token = '<IMG_CONTEXT>'
-         self.image_token_num = 256
-         self.image_start_token = '<img>'
-         self.image_end_token = '</img>'
-         self.img_context_token_id = self.tokenizer.convert_tokens_to_ids(self.image_context_token)
-
-     def format_image_tokens(self, patch_num):
-         return f'{self.image_start_token}{self.image_context_token * self.image_token_num * patch_num}{self.image_end_token}'
-
-     def forward(self, images, output_format='latex', **kwargs):
-         # preprocess images into pixel values
-         if not isinstance(images, list):
-             images = [images]
-
-         pixel_values_list = []
-         for image in images:
-             patch_images = self.dynamic_preprocess(
-                 image, image_size=448, max_num=12
-             )
-             pixel_values = self.image_processor(
-                 patch_images,
-                 return_tensors='pt'
-             )['pixel_values'].to(torch.bfloat16)
-             pixel_values_list.append(pixel_values)
-
-         batch_size = len(pixel_values_list)
-         conversation_list = []
-         for bs_idx in range(batch_size):
-             pixel_values = pixel_values_list[bs_idx].to(torch.bfloat16)
-
-             image_tokens = self.format_image_tokens(pixel_values.shape[0])
-             question = '<image>\n' + self.prompt_template[output_format]
-             answer = None
-
-             template = get_conv_template(self.model.config.template)
-             template.append_message(template.roles[0], question)
-             template.append_message(template.roles[1], answer)
-             conversation = template.get_prompt()
-             conversation = conversation.replace('<image>', image_tokens, 1)
-             conversation_list.append(conversation)
-
-         device = next(self.parameters()).device
-         self.tokenizer.padding_side = 'left'
-         model_inputs = self.tokenizer(
-             conversation_list,
-             return_tensors='pt',
-             padding=True,
-             max_length=self.tokenizer.model_max_length,
-             truncation=True,
-         ).to(device)
-         pixel_values = torch.cat(pixel_values_list, dim=0).to(device)
-
-         # generation config
-         generation_config = dict(
-             max_new_tokens=self.max_new_tokens,
-             max_time=self.max_generate_time,
-             img_context_token_id=self.img_context_token_id,
-             pad_token_id=self.tokenizer.pad_token_id,
-             eos_token_id=self.tokenizer.eos_token_id,
-             do_sample=False,
-             no_repeat_ngram_size=20,
-         )
-
-         # generate text from image tokens
-         model_output = self.model.generate(
-             pixel_values=pixel_values,
-             input_ids=model_inputs.input_ids,
-             attention_mask=model_inputs.attention_mask,
-             **generation_config,
-             # **kwargs
-         )
-
-         batch_decode_texts = self.tokenizer.batch_decode(
-             model_output,
-             skip_special_tokens=True
-         )
-         return batch_decode_texts
-
-     def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
-         best_ratio_diff = float('inf')
-         best_ratio = (1, 1)
-         area = width * height
-         for ratio in target_ratios:
-             target_aspect_ratio = ratio[0] / ratio[1]
-             ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-             if ratio_diff < best_ratio_diff:
-                 best_ratio_diff = ratio_diff
-                 best_ratio = ratio
-             elif ratio_diff == best_ratio_diff:
-                 if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                     best_ratio = ratio
-         return best_ratio
-
-     def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
-         orig_width, orig_height = image.size
-         aspect_ratio = orig_width / orig_height
-
-         # enumerate candidate tiling grids (columns x rows) within the tile budget
-         target_ratios = set(
-             (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-             i * j <= max_num and i * j >= min_num)
-         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-         # find the closest aspect ratio to the target
-         target_aspect_ratio = self.find_closest_aspect_ratio(
-             aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
-         # calculate the target width and height
-         target_width = image_size * target_aspect_ratio[0]
-         target_height = image_size * target_aspect_ratio[1]
-         blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-         # resize the image
-         resized_img = image.resize((target_width, target_height))
-         processed_images = []
-         for i in range(blocks):
-             box = (
-                 (i % (target_width // image_size)) * image_size,
-                 (i // (target_width // image_size)) * image_size,
-                 ((i % (target_width // image_size)) + 1) * image_size,
-                 ((i // (target_width // image_size)) + 1) * image_size
-             )
-             # split the image
-             split_img = resized_img.crop(box)
-             processed_images.append(split_img)
-         assert len(processed_images) == blocks
-         if use_thumbnail and len(processed_images) != 1:
-             thumbnail_img = image.resize((image_size, image_size))
-             processed_images.append(thumbnail_img)
-         return processed_images
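
For context, a minimal usage sketch of the removed `InternVL` wrapper follows. It is not part of the diff: the import path is an assumption (the class lived in an internal module of the wheel), as are the CUDA device, the downloadable `U4R/StructTable-InternVL2-1B` checkpoint, and the file name `table.png`.

```python
# Hypothetical usage sketch; the import path and file name are assumptions.
from PIL import Image

# InternVL is assumed importable from the package's internal models module.
model = InternVL(model_path='U4R/StructTable-InternVL2-1B', max_new_tokens=1024)
model = model.cuda()  # weights are already bfloat16 via from_pretrained

image = Image.open('table.png').convert('RGB')
# forward() accepts one image or a list and returns one decoded string per image
outputs = model(image, output_format='latex')
print(outputs[0])
```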
@@ -1,81 +0,0 @@
- import torch
- from torch import nn
-
- from transformers import AutoTokenizer
- try:
-     from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig, ChatTemplateConfig
- except ImportError:
-     print("\033[93mimport lmdeploy failed; if you do not use lmdeploy, ignore this message\033[0m")
-
-
- class InternVL_LMDeploy(nn.Module):
-     def __init__(
-         self,
-         model_path='U4R/StructTable-InternVL2-1B',
-         max_new_tokens=1024,
-         batch_size=4,
-         cache_dir=None,
-         local_files_only=None,
-         **kwargs,
-     ):
-         super().__init__()
-         self.model_path = model_path
-         self.max_new_tokens = max_new_tokens
-         self.max_batch_size = batch_size
-         self.cache_dir = cache_dir
-         self.local_files_only = local_files_only
-
-         # init tokenizer and model from ckpt path
-         self.init_tokenizer(model_path)
-         self.init_model(model_path)
-
-         self.prompt_template = {
-             'latex': '<latex>',
-             'html': '<html>',
-             'markdown': '<markdown>',
-         }
-         # supported output formats
-         self.supported_output_format = ['latex', 'html', 'markdown']
-
-     def init_tokenizer(self, tokenizer_path):
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             pretrained_model_name_or_path=tokenizer_path,
-             trust_remote_code=True,
-             use_fast=False,
-             cache_dir=self.cache_dir,
-             local_files_only=self.local_files_only,
-         )
-
-     def init_model(self, model_path):
-         engine_config = PytorchEngineConfig(
-             dtype='bfloat16',
-             max_batch_size=self.max_batch_size,
-             cache_max_entry_count=0.1
-         )
-         self.pipeline = pipeline(
-             model_path,
-             backend_config=engine_config,
-             chat_template_config=ChatTemplateConfig(model_name='internvl2-internlm2')
-         )
-
-     def forward(self, images, output_format='latex', **kwargs):
-         # normalize input to a list of images
-         if not isinstance(images, list):
-             images = [images]
-
-         prompts = [self.prompt_template[output_format]] * len(images)
-         generation_config = GenerationConfig(
-             max_new_tokens=self.max_new_tokens,
-             do_sample=False,
-             temperature=1.0,
-             stop_token_ids=[self.tokenizer.eos_token_id],
-         )
-
-         responses = self.pipeline(
-             [(x, y) for x, y in zip(prompts, images)],
-             gen_config=generation_config,
-         )
-         batch_decode_texts = [response.text for response in responses]
-         return batch_decode_texts
-
-
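
Likewise, a hedged sketch of how the removed `InternVL_LMDeploy` wrapper would be driven, assuming lmdeploy is installed and the class is importable; the LMDeploy pipeline manages device placement itself, so no explicit `.cuda()` call is shown.

```python
# Hypothetical usage sketch; assumes lmdeploy is installed and InternVL_LMDeploy
# is importable from the package's internal models module.
from PIL import Image

model = InternVL_LMDeploy(
    model_path='U4R/StructTable-InternVL2-1B',
    max_new_tokens=1024,
    batch_size=4,
)

images = [Image.open('table.png').convert('RGB')]
# each image is paired with its format prompt ('<latex>', '<html>', or '<markdown>')
html_outputs = model(images, output_format='html')
print(html_outputs[0])
```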
@@ -1,3 +0,0 @@
- from .pix2s import Pix2Struct
- from .pix2s_trt import Pix2StructTensorRT
-
@@ -1,76 +0,0 @@
- import torch
-
- from torch import nn
- from transformers import AutoModelForVision2Seq, AutoProcessor
-
-
- class Pix2Struct(nn.Module):
-     def __init__(
-         self,
-         model_path='U4R/StructTable-base',
-         max_new_tokens=1024,
-         max_time=30,
-         cache_dir=None,
-         local_files_only=None,
-         **kwargs,
-     ):
-         super().__init__()
-         self.model_path = model_path
-         self.max_new_tokens = max_new_tokens
-         self.max_generate_time = max_time
-         self.cache_dir = cache_dir
-         self.local_files_only = local_files_only
-
-         # init image processor and model from ckpt path
-         self.init_image_processor(model_path)
-         self.init_model(model_path)
-
-         self.special_str_list = ['\\midrule', '\\hline']
-         self.supported_output_format = ['latex']
-
-     def postprocess_latex_code(self, code):
-         for special_str in self.special_str_list:
-             code = code.replace(special_str, special_str + ' ')
-         return code
-
-     def init_model(self, model_path):
-         self.model = AutoModelForVision2Seq.from_pretrained(
-             pretrained_model_name_or_path=model_path,
-             cache_dir=self.cache_dir,
-             local_files_only=self.local_files_only,
-         )
-         self.model.eval()
-
-     def init_image_processor(self, image_processor_path):
-         self.data_processor = AutoProcessor.from_pretrained(
-             pretrained_model_name_or_path=image_processor_path,
-             cache_dir=self.cache_dir,
-             local_files_only=self.local_files_only,
-         )
-
-     def forward(self, image, **kwargs):
-         # process image to tokens
-         image_tokens = self.data_processor.image_processor(
-             images=image,
-             return_tensors='pt',
-         )
-
-         device = next(self.parameters()).device
-         for k, v in image_tokens.items():
-             image_tokens[k] = v.to(device)
-
-         # generate text from image tokens
-         model_output = self.model.generate(
-             flattened_patches=image_tokens['flattened_patches'],
-             attention_mask=image_tokens['attention_mask'],
-             max_new_tokens=self.max_new_tokens,
-             max_time=self.max_generate_time,
-             no_repeat_ngram_size=20,
-         )
-
-         latex_codes = self.data_processor.batch_decode(model_output, skip_special_tokens=True)
-         # postprocess
-         for i, code in enumerate(latex_codes):
-             latex_codes[i] = self.postprocess_latex_code(code)
-
-         return latex_codes
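
Finally, a sketch of the removed `Pix2Struct` wrapper under the same assumptions (importable class, downloadable `U4R/StructTable-base` checkpoint, PIL input); this backend only emits LaTeX.

```python
# Hypothetical usage sketch; import path and file name are assumptions.
from PIL import Image

model = Pix2Struct(model_path='U4R/StructTable-base', max_new_tokens=1024, max_time=30)
model = model.cuda()  # optional; forward() follows the module's device

image = Image.open('table.png').convert('RGB')
latex_codes = model(image)  # returns a list of LaTeX strings
print(latex_codes[0])       # spaces are added after \midrule / \hline in postprocessing
```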